import pandas as pd
import pandas as pd
# naming the data sources
# trainperf.csv: current-loan performance records
url1 = "https://raw.githubusercontent.com/Oyeniran20/axia_cohort_8/refs/heads/main/trainperf.csv"
# traindemographics.csv: customer demographic attributes
url2 = "https://raw.githubusercontent.com/Oyeniran20/axia_cohort_8/refs/heads/main/traindemographics.csv"
# trainprevloans.csv: historical (previous) loan records, many rows per customer
url3 = "https://raw.githubusercontent.com/Oyeniran20/axia_cohort_8/refs/heads/main/trainprevloans.csv"
# import all the necessary libraries
!pip install xgboost --quiet
!pip install catboost --quiet
!pip install shap --quiet
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime
import time
import shap
import joblib
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', '{:.2f}'.format)
from sklearn.model_selection import train_test_split
import plotly.express as px
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import VarianceThreshold # feature selector
# Import the Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
# import pipelines
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline # important: from imblearn, not sklearn
from imblearn.over_sampling import SMOTE
# import metrics
from sklearn.metrics import (
accuracy_score, precision_score, recall_score, f1_score,
roc_auc_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 99.2/99.2 MB 7.3 MB/s eta 0:00:00
# load the loan-performance dataset from GitHub
df_perf = pd.read_csv(url1)
df_perf.head()
df_perf
# Parse both timestamp columns and keep a time-of-day copy of each
for raw_col, time_col in (('approveddate', 'approved_time'),
                          ('creationdate', 'creation_time')):
    df_perf[raw_col] = pd.to_datetime(df_perf[raw_col])
    df_perf[time_col] = df_perf[raw_col].dt.time
# Inspect the parsed timestamps
print("Original columns with date and time:")
print(df_perf[['approveddate', 'creationdate']].head())
Original columns with date and time:
approveddate creationdate
0 2017-07-25 08:22:56 2017-07-25 07:22:47
1 2017-07-05 17:04:41 2017-07-05 16:04:18
2 2017-07-06 14:52:57 2017-07-06 13:52:51
3 2017-07-27 19:00:41 2017-07-27 18:00:35
4 2017-07-03 23:42:45 2017-07-03 22:42:39
# Strip the time component from the original columns, keeping only the date
for date_col in ('approveddate', 'creationdate'):
    df_perf[date_col] = df_perf[date_col].dt.date
# Inspect the split-out date and time pieces
print("Updated columns - dates only:")
print(df_perf[['approveddate', 'creationdate']].head())
print("\nTime columns we created earlier:")
print(df_perf[['approved_time', 'creation_time']].head())
# Side-by-side view of each date with its extracted time
df_perf[['approveddate', 'approved_time', 'creationdate', 'creation_time']].head()
df_perf.head()
df_perf
# load the demographics dataset from GitHub
df_demo = pd.read_csv(url2)
df_demo.head()
df_demo
# Parse birthdate once, then derive the birth year and keep only the date part
parsed_birth = pd.to_datetime(df_demo['birthdate'])
df_demo['birth_year'] = parsed_birth.dt.year
df_demo['birthdate'] = parsed_birth.dt.date
# Inspect the results
print("Birthdate with extracted year:")
print(df_demo[['customerid', 'birthdate', 'birth_year']].head())
# View the updated dataframe
df_demo.head()
df_demo
# load the previous-loans dataset from GitHub
df_prevloans = pd.read_csv(url3)
df_prevloans.head()
df_prevloans
# Timestamp columns to split into a date part and a time-of-day part
datetime_columns = ['approveddate', 'creationdate', 'closeddate', 'firstduedate', 'firstrepaiddate']
# Parse each column once, store its time-of-day in a new '<stem>_time'
# column (e.g. 'approveddate' -> 'approved_time'), then keep only the
# calendar date in the original column
for col in datetime_columns:
    if col in df_prevloans.columns:
        parsed = pd.to_datetime(df_prevloans[col])
        df_prevloans[col[:-4] + '_time'] = parsed.dt.time
        df_prevloans[col] = parsed.dt.date
# Check the results
print("Original date columns (now date only):")
print(df_prevloans[['approveddate', 'creationdate', 'closeddate', 'firstduedate', 'firstrepaiddate']].head())
print("\nNew time columns:")
print(df_prevloans[['approved_time', 'creation_time', 'closed_time', 'firstdue_time', 'firstrepaid_time']].head())
# Display updated info
print(f"\nDataset shape: {df_prevloans.shape}")
print("All columns:", df_prevloans.columns.tolist())
Original date columns (now date only): approveddate creationdate closeddate firstduedate firstrepaiddate 0 2016-08-15 2016-08-15 2016-09-01 2016-09-14 2016-09-01 1 2017-04-28 2017-04-28 2017-05-28 2017-05-30 2017-05-26 2 2017-03-05 2017-03-05 2017-04-26 2017-04-04 2017-04-26 3 2017-04-09 2017-04-09 2017-04-24 2017-04-24 2017-04-24 4 2017-06-17 2017-06-17 2017-07-14 2017-07-03 2017-07-14 New time columns: approved_time creation_time closed_time firstdue_time firstrepaid_time 0 18:22:40 17:22:32 16:06:48 00:00:00 15:51:43 1 18:39:07 17:38:53 14:44:49 00:00:00 00:00:00 2 10:56:25 09:56:19 22:18:56 00:00:00 22:03:47 3 18:25:55 17:25:42 01:35:52 00:00:00 00:48:43 4 09:29:57 08:29:50 21:18:43 00:00:00 21:08:35 Dataset shape: (18183, 17) All columns: ['customerid', 'systemloanid', 'loannumber', 'approveddate', 'creationdate', 'loanamount', 'totaldue', 'termdays', 'closeddate', 'referredby', 'firstduedate', 'firstrepaiddate', 'approved_time', 'creation_time', 'closed_time', 'firstdue_time', 'firstrepaid_time']
# preview the cleaned previous-loans table
df_prevloans.head()
df_prevloans
# back to inspecting the performance table
df_perf
# checking for the size of the data
df_perf.shape
(4368, 12)
# listing out all the columns of the performance dataset
df_perf.columns
Index(['customerid', 'systemloanid', 'loannumber', 'approveddate',
'creationdate', 'loanamount', 'totaldue', 'termdays', 'referredby',
'good_bad_flag', 'approved_time', 'creation_time'],
dtype='object')# checking for the info of the perfomance dataset
# checking the info (dtypes, non-null counts) of the performance dataset
df_perf.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4368 entries, 0 to 4367 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customerid 4368 non-null object 1 systemloanid 4368 non-null int64 2 loannumber 4368 non-null int64 3 approveddate 4368 non-null object 4 creationdate 4368 non-null object 5 loanamount 4368 non-null float64 6 totaldue 4368 non-null float64 7 termdays 4368 non-null int64 8 referredby 587 non-null object 9 good_bad_flag 4368 non-null object 10 approved_time 4368 non-null object 11 creation_time 4368 non-null object dtypes: float64(2), int64(3), object(7) memory usage: 409.6+ KB
# Count missing values per column
df_perf.isna().sum()
# Express the missing values as a percentage of all rows
df_perf.isna().mean().sort_values(ascending=False) * 100
# referredby is mostly missing (587 of 4368 non-null), so drop it
df_perf.drop(columns='referredby', inplace=True)
# confirming the referredby column is gone
df_perf.isna().mean().sort_values(ascending=False) * 100
# Re-parse the date columns (they became plain date objects earlier)
for date_col in ('approveddate', 'creationdate'):
    df_perf[date_col] = pd.to_datetime(df_perf[date_col])
# verifying the datatypes
df_perf.dtypes
np.int64(0)
np.int64(0)
# checking if our primary key(customerid) is unique
df_perf['customerid'].nunique()
4368
# 4368 unique ids for 4368 rows, so customerid is unique here
# confirming if our dataset is clean
df_perf.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4368 entries, 0 to 4367 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customerid 4368 non-null object 1 systemloanid 4368 non-null int64 2 loannumber 4368 non-null int64 3 approveddate 4368 non-null datetime64[ns] 4 creationdate 4368 non-null datetime64[ns] 5 loanamount 4368 non-null float64 6 totaldue 4368 non-null float64 7 termdays 4368 non-null int64 8 good_bad_flag 4368 non-null object 9 approved_time 4368 non-null object 10 creation_time 4368 non-null object dtypes: datetime64[ns](2), float64(2), int64(3), object(4) memory usage: 375.5+ KB
DEMOGRAPHICS DATASET
# preview of the demographics table
df_demo
# checking for the size of the data
df_demo.shape
(4346, 10)
# listing out all the columns of the demographics dataset
df_demo.columns
Index(['customerid', 'birthdate', 'bank_account_type', 'longitude_gps',
'latitude_gps', 'bank_name_clients', 'bank_branch_clients',
'employment_status_clients', 'level_of_education_clients',
'birth_year'],
dtype='object')# checking Afor the info of the perfomance dataset
# checking the info (dtypes, non-null counts) of the demographics dataset
df_demo.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4346 entries, 0 to 4345 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customerid 4346 non-null object 1 birthdate 4346 non-null object 2 bank_account_type 4346 non-null object 3 longitude_gps 4346 non-null float64 4 latitude_gps 4346 non-null float64 5 bank_name_clients 4346 non-null object 6 bank_branch_clients 51 non-null object 7 employment_status_clients 3698 non-null object 8 level_of_education_clients 587 non-null object 9 birth_year 4346 non-null int32 dtypes: float64(2), int32(1), object(7) memory usage: 322.7+ KB
# Count missing values per column
df_demo.isna().sum()
# Missing values as a percentage of all rows
df_demo.isna().mean().sort_values(ascending=False) * 100
# bank_branch_clients and level_of_education_clients are mostly missing
# (>80%), so drop both
df_demo.drop(columns=['bank_branch_clients', 'level_of_education_clients'], inplace=True)
# Inspect the categories present in employment_status_clients
df_demo['employment_status_clients'].unique()
array([nan, 'Permanent', 'Student', 'Self-Employed', 'Unemployed',
'Retired', 'Contract'], dtype=object)sns.countplot(df_demo['employment_status_clients'])
From the plot, we notice that the 'Permanent' category dominates, so filling missing values with the most frequent category would introduce bias. Instead, we will fill them with 'unknown', meaning that the 'employment_status_clients' of about 15% of customers is unknown.
# converting the birthdate to datetime
df_demo['birthdate'] = pd.to_datetime(df_demo['birthdate'])
# verifying the datatypes
df_demo.dtypes
# checking for fully duplicated rows
df_demo.duplicated().sum()
np.int64(12)
np.int64(12)
# checking if our primary key(customerid) is unique
df_demo['customerid'].nunique()
4334
# 4334 unique ids vs 4346 rows -> 12 duplicate customers
df_demo.shape
(4346, 8)
We Notice that our primary key here is not unique and has duplicates, so lets drop duplicates
# dropping the duplicate rows (keeps the first occurrence)
df_demo = df_demo.drop_duplicates()
# confirming the duplicates were dropped
df_demo.duplicated().sum()
np.int64(0)
# checking the size of the dataset
df_demo.shape
(4334, 8)
The primary key is now unique
<class 'pandas.core.frame.DataFrame'> Index: 4334 entries, 0 to 4345 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customerid 4334 non-null object 1 birthdate 4334 non-null datetime64[ns] 2 bank_account_type 4334 non-null object 3 longitude_gps 4334 non-null float64 4 latitude_gps 4334 non-null float64 5 bank_name_clients 4334 non-null object 6 employment_status_clients 4334 non-null object 7 birth_year 4334 non-null int32 dtypes: datetime64[ns](1), float64(2), int32(1), object(4) memory usage: 287.8+ KB
Previous Loan Dataset
# re-importing the previous loans dataset (a fresh copy, without the
# earlier date/time split columns - note the 12-column shape below)
df_prevloans = pd.read_csv(url3)
df_prevloans.head()
df_prevloans
# size of the dataset
df_prevloans.shape
(18183, 12)
# columns of the dataset
df_prevloans.columns
Index(['customerid', 'systemloanid', 'loannumber', 'approveddate',
'creationdate', 'loanamount', 'totaldue', 'termdays', 'closeddate',
'referredby', 'firstduedate', 'firstrepaiddate'],
dtype='object')# checking the number of missing values in our dataset
# Count missing values per column
df_prevloans.isna().sum()
# referredby is mostly missing, so it carries too little signal - drop it
df_prevloans.drop(columns='referredby', inplace=True)
# Missing values as a percentage of all rows (should be all zeros now)
df_prevloans.isna().mean().sort_values(ascending=False) * 100
# Parse every date column in one pass
date_cols = ['approveddate', 'creationdate', 'closeddate', 'firstduedate', 'firstrepaiddate']
df_prevloans[date_cols] = df_prevloans[date_cols].apply(pd.to_datetime)
df_prevloans.dtypes
# checking for fully duplicated rows
df_prevloans.duplicated().sum()
np.int64(0)
# checking for duplicates in the primary key
df_prevloans['customerid'].duplicated().sum()
np.int64(13824)
# checking for number of unique customers
df_prevloans['customerid'].nunique()
4359
# many rows per customer: 18,183 rows but only 4,359 unique ids
# checking the size of the dataset
df_prevloans.shape
(18183, 11)
Now observe that in our customers' previous-loan dataset, the customerid is not unique: of the 18,183 rows, only 4,359 customerids are unique, leaving about 13,824 duplicates (which implies that a customer can have many past loans). This conflicts with our aim of utilizing customer behavior and financial data to build a predictive model, enhancing accuracy and efficiency in risk assessment, because it would cause data misalignment if left untreated. In order not to contradict the aim of this project, I need to make the customerid unique, i.e., one customer per row (one row, one customer). By doing this we can utilize the customers' behavior and financial data effectively for prediction and gain proper, efficient risk assessment.
To be able to make our customerid unique in our customers previous loan dataset, We need to "Aggregate" df_prevloans to one row per customer(e.g,. number of loans, total loan amount) so as to enable us merge with both the customers performance dataset and the customers demographics dataset so as to avoid misalignment and also deviating from our aim.
In the course of aggregating, I will create some new features from the already existing features and then aggregate.
# --- Feature engineering on the previous-loans table ---
# Ratio of the amount owed to the amount borrowed
df_prevloans['repayment_ratio'] = df_prevloans['totaldue'] / df_prevloans['loanamount']
# Days the loan stayed open, from approval to closure
df_prevloans['duration_days'] = (df_prevloans['closeddate'] - df_prevloans['approveddate']).dt.days
# Days between the first due date and the first repayment (positive = late)
df_prevloans['repay_delay_days'] = (df_prevloans['firstrepaiddate'] - df_prevloans['firstduedate']).dt.days
# 1 if the first repayment came after its due date, else 0
df_prevloans['firstrepaid_late'] = df_prevloans['repay_delay_days'].gt(0).astype(int)
# 1 if the loan was closed after its agreed term, else 0
df_prevloans['closed_late'] = df_prevloans['duration_days'].gt(df_prevloans['termdays']).astype(int)
# Interest charged, in currency terms
df_prevloans['interest'] = df_prevloans['totaldue'] - df_prevloans['loanamount']
# inspect the engineered columns
df_prevloans.head()
df_prevloans
# Collapse the loan-level history to one row per customer. Named
# aggregations set the output column names directly, so no separate
# rename step is needed.
df_prevloans_agg = (
    df_prevloans
    .groupby('customerid', as_index=False)
    .agg(
        num_prev_loans=('systemloanid', 'count'),            # number of past loans
        avg_prev_loanamt=('loanamount', 'mean'),
        avg_repay_delay_days=('repay_delay_days', 'mean'),
        total_firstrepaid_late=('firstrepaid_late', 'sum'),  # total late first repayments
        total_closed_late=('closed_late', 'sum'),            # total late closures
        avg_prev_repayment_ratio=('repayment_ratio', 'mean'),
        avg_duration_days=('duration_days', 'mean'),
        avg_prev_interest=('interest', 'mean'),
    )
)
After creating the new features, I grouped all rows that belong to the same customer, then aggregate their past loans into a single record per customer which now makes our customer previous loan unique and ready to be merged.
Before I proceed, i would love to give a little description of my aggregated table:
df_prevloans_agg| Column Name | Description |
|---|---|
customerid |
Unique identifier for the customer. |
num_prev_loans |
Total number of previous loans the customer has taken. |
avg_prev_loanamt |
Average loan amount from previous loans. |
max_prev_loanamt |
Maximum loan amount from previous loans. |
min_prev_loanamt |
Minimum loan amount from previous loans. |
total_firstrepaid_late |
Total count of previous loans where the first repayment was made after the due date. |
total_closed_late |
Total count of previous loans that were closed later than the planned term. |
avg_prev_repayment_ratio |
Average ratio of total amount repaid to the original loan amount for previous loans. |
avg_duration_days |
Average number of days from loan approval to loan closure for previous loans. |
avg_prev_interest_rate |
Average interest rate of previous loans, calculated as interest per loan amount per term. |
avg_prev_interest |
Average interest amount paid for previous loans. |
# viewing the first 5 rows of the aggregated table
df_prevloans_agg.head()
df_prevloans_agg
# checking the size: exactly one row per unique customer
df_prevloans_agg.shape
(4359, 9)
4359
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4359 entries, 0 to 4358 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customerid 4359 non-null object 1 num_prev_loans 4359 non-null int64 2 avg_prev_loanamt 4359 non-null float64 3 avg_repay_delay_days 4359 non-null float64 4 total_firstrepaid_late 4359 non-null int64 5 total_closed_late 4359 non-null int64 6 avg_prev_repayment_ratio 4359 non-null float64 7 avg_duration_days 4359 non-null float64 8 avg_prev_interest 4359 non-null float64 dtypes: float64(5), int64(3), object(1) memory usage: 306.6+ KB
np.int64(0)
(4368, 11)
(4334, 8)
(4359, 9)
Customers in perf not found in demo: 1099
customerid systemloanid loannumber approveddate \ 5 8a8589f35451855401546b0738c42524 301986516 8 2017-07-19 29 8a858ee55830c4b90158337542ab18a1 301972649 6 2017-07-11 34 8a858f1955b1c4df0155cd14c5b478ed 302000569 2 2017-07-28 38 8a858f3d5add42e2015ae0ca6cb66b83 301998400 8 2017-07-27 40 8a858f4f5511dca201551b73634170b6 301997295 4 2017-07-26 creationdate loanamount totaldue termdays good_bad_flag approved_time \ 5 2017-07-19 30000.00 39000.00 60 Good 21:46:24 29 2017-07-11 10000.00 13000.00 30 Bad 09:28:30 34 2017-07-28 10000.00 13000.00 30 Good 13:17:13 38 2017-07-27 30000.00 34500.00 30 Good 09:34:11 40 2017-07-26 10000.00 13000.00 30 Good 16:15:25 creation_time 5 20:46:18 29 08:27:20 34 12:16:00 38 08:34:05 40 15:15:18
Now i will merge
# viewing the size of the performance + demographics merge
df_main.shape
(3269, 18)
What this tells us is that only 3,269 customers have their demographic information fully filled, meaning that they fully applied for a loan, while 1,099 don't have demographic records, meaning that they either didn't apply or never intended to apply.
Now, before I merge the already merged performance and the demographic with the previous loan, I want to see how many customers that have not collected loan in the past, thereby making them not to have historical data. They can be called new loanees.
# Customers with a current loan but no aggregated loan history
set_main = set(df_main['customerid'])
set_prevloans_agg = set(df_prevloans_agg['customerid'])
missing_customers = set_main.difference(set_prevloans_agg)
print(f"Customers in main not found in prevloans_agg: {len(missing_customers)}")
Customers in main not found in prevloans_agg: 5
So we have 5 customers not having a historical data and therefore do not have previous loans
# let us see the customers without any loan history
missing_ids = df_main[df_main['customerid'].isin(missing_customers)]
print(missing_ids.head())
customerid systemloanid loannumber approveddate \
1 8a85886e54beabf90154c0a29ae757c0 301965204 2 2017-07-05
229 8a76e7d443e6e97c0143ed0a13cb4f61 301999706 4 2017-07-28
1675 8a858e4357be1daf0157c96f4c915ef0 302001005 2 2017-07-28
2201 8a858fda56562f8f01565f928f516cea 301998904 2 2017-07-27
2872 8a858e69566ae5b801567ac352d84477 301992704 2 2017-07-24
creationdate loanamount totaldue termdays good_bad_flag approved_time \
1 2017-07-05 15000.00 17250.00 30 Good 17:04:41
229 2017-07-27 30000.00 39000.00 60 Bad 00:12:31
1675 2017-07-28 10000.00 11500.00 15 Good 17:15:11
2201 2017-07-27 10000.00 13000.00 30 Bad 14:38:40
2872 2017-07-24 10000.00 13000.00 30 Good 05:32:11
creation_time birthdate bank_account_type longitude_gps latitude_gps \
1 16:04:18 1985-08-23 Savings 3.89 7.32
229 23:11:19 1974-02-23 Savings 3.32 6.61
1675 16:15:03 1981-01-22 Savings 5.23 7.60
2201 13:37:26 1969-12-29 Savings 3.39 6.46
2872 04:31:53 1981-05-14 Savings 3.37 7.12
bank_name_clients employment_status_clients birth_year
1 GT Bank Permanent 1985
229 GT Bank Permanent 1974
1675 Diamond Bank Permanent 1981
2201 UBA Permanent 1969
2872 Wema Bank Permanent 1981
On merging, we have:
# Left-join the aggregated loan history onto the perf + demographics table;
# customers with no history get NaN in the aggregate columns
df_final = df_main.merge(df_prevloans_agg, on='customerid', how='left')
# checking the size of the merged dataset
df_final.shape
(3269, 26)
# checking the information of the dataset
df_final.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3269 entries, 0 to 3268 Data columns (total 26 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customerid 3269 non-null object 1 systemloanid 3269 non-null int64 2 loannumber 3269 non-null int64 3 approveddate 3269 non-null datetime64[ns] 4 creationdate 3269 non-null datetime64[ns] 5 loanamount 3269 non-null float64 6 totaldue 3269 non-null float64 7 termdays 3269 non-null int64 8 good_bad_flag 3269 non-null object 9 approved_time 3269 non-null object 10 creation_time 3269 non-null object 11 birthdate 3269 non-null datetime64[ns] 12 bank_account_type 3269 non-null object 13 longitude_gps 3269 non-null float64 14 latitude_gps 3269 non-null float64 15 bank_name_clients 3269 non-null object 16 employment_status_clients 3269 non-null object 17 birth_year 3269 non-null int32 18 num_prev_loans 3264 non-null float64 19 avg_prev_loanamt 3264 non-null float64 20 avg_repay_delay_days 3264 non-null float64 21 total_firstrepaid_late 3264 non-null float64 22 total_closed_late 3264 non-null float64 23 avg_prev_repayment_ratio 3264 non-null float64 24 avg_duration_days 3264 non-null float64 25 avg_prev_interest 3264 non-null float64 dtypes: datetime64[ns](3), float64(12), int32(1), int64(3), object(7) memory usage: 651.4+ KB
As expected, we have missing values in the columns that came from prevloans_agg, simply because we used a left join, which keeps every row of the left table even when there is no matching loan history. That is why the 5 customers without any loan history (the new loanees) have missing aggregate values. Now let's check for the missing values and deal with them.
# counting missing values per column (NaNs expected only in the
# history-aggregate columns, from the left join)
df_final.isna().sum()
Now to deal with this missing value problem, I will fill with default values such as:
# Fill the aggregate-history columns with 0 for the 5 customers with no
# previous loans (their values are NaN from the left join). 0 is a natural
# default: no prior loans, no lateness, no repayment history.
# NOTE: the original dict also listed 'avg_prev_interest_rate', which is
# not a column of df_final and was silently ignored by fillna - removed.
df_final.fillna({
    'num_prev_loans': 0,
    'avg_prev_loanamt': 0,
    'avg_repay_delay_days': 0,
    'total_firstrepaid_late': 0,
    'total_closed_late': 0,
    'avg_prev_repayment_ratio': 0,
    'avg_duration_days': 0,
    'avg_prev_interest': 0
}, inplace=True)
# checking for percentage missing values (should now be 0 everywhere)
(df_final.isna().sum().sort_values(ascending=False)/len(df_final))*100
(3269, 26)
Index(['customerid', 'systemloanid', 'loannumber', 'approveddate',
'creationdate', 'loanamount', 'totaldue', 'termdays', 'good_bad_flag',
'approved_time', 'creation_time', 'birthdate', 'bank_account_type',
'longitude_gps', 'latitude_gps', 'bank_name_clients',
'employment_status_clients', 'birth_year', 'num_prev_loans',
'avg_prev_loanamt', 'avg_repay_delay_days', 'total_firstrepaid_late',
'total_closed_late', 'avg_prev_repayment_ratio', 'avg_duration_days',
'avg_prev_interest'],
dtype='object')At this stage, I will utilize the latitude and longitude coordinates in the dataset to determine the geographic locations of all data points. Rather than restricting the analysis to a specific country, I will consider all points irrespective of their geographic boundaries. This approach involves mapping each coordinate globally and extracting the corresponding place names (such as city, state, and country) to enrich the dataset with meaningful location information. This will enhance the spatial analysis and visualization by providing clear, contextual geographic references for all data entries.
Plotting the various locations on a world is seen below:
Index(['customerid', 'systemloanid', 'loannumber', 'approveddate',
'creationdate', 'loanamount', 'totaldue', 'termdays', 'good_bad_flag',
'approved_time', 'creation_time', 'birthdate', 'bank_account_type',
'longitude_gps', 'latitude_gps', 'bank_name_clients',
'employment_status_clients', 'birth_year', 'num_prev_loans',
'avg_prev_loanamt', 'avg_repay_delay_days', 'total_firstrepaid_late',
'total_closed_late', 'avg_prev_repayment_ratio', 'avg_duration_days',
'avg_prev_interest'],
dtype='object')I will move forward to engineering few features from the total merged datasets of the customer's performance, demographic and previous loan dataset to aid our prediction using our customers behavior and finanacial data.
First, I will engineer the age using the creation date and the birthdate to get each customer's age, after which I will drop the birthdate as it will no longer be useful for our analysis and prediction. Next, I will engineer the age group of the various customers to distinguish between young adults, adults, middle-aged adults and older adults, to discover which age group defaults the most, and so on.
array([45, 31, 32, 39, 30, 28, 29, 34, 52, 40, 38, 35, 41, 23, 24, 33, 26,
36, 42, 55, 46, 47, 37, 25, 21, 43, 22, 50, 27, 51, 44, 49, 48, 53,
54])Minimum age: 21 Maximum age: 55
Index(['customerid', 'systemloanid', 'loannumber', 'approveddate',
'creationdate', 'loanamount', 'totaldue', 'termdays', 'good_bad_flag',
'approved_time', 'creation_time', 'bank_account_type', 'longitude_gps',
'latitude_gps', 'bank_name_clients', 'employment_status_clients',
'birth_year', 'num_prev_loans', 'avg_prev_loanamt',
'avg_repay_delay_days', 'total_firstrepaid_late', 'total_closed_late',
'avg_prev_repayment_ratio', 'avg_duration_days', 'avg_prev_interest',
'age'],
dtype='object')(3269, 26)
### EXPLORATORY DATA ANALYSIS OF THE MERGED DATASET df_loan
This merged dataset provides a complete view of each customer’s loan history, personal details, and current loan performance, enabling us to uncover patterns and relationships that may influence loan repayment behavior.
With the data fully cleaned and prepared, we can now proceed to explore it, analyze distributions, detect trends, and identify potential predictors for our risk assessment model.
(3269, 26)
Even though the merged dataset is now clean and ready for modeling, I still want to perform further checks and exploratory analysis — because you never know what might have slipped through during data processing. It’s always a good idea to double-check for anomalies, outliers, or unexpected patterns before moving forward.
np.int64(0)
Index(['customerid', 'systemloanid', 'loannumber', 'approveddate',
'creationdate', 'loanamount', 'totaldue', 'termdays', 'good_bad_flag',
'approved_time', 'creation_time', 'bank_account_type', 'longitude_gps',
'latitude_gps', 'bank_name_clients', 'employment_status_clients',
'birth_year', 'num_prev_loans', 'avg_prev_loanamt',
'avg_repay_delay_days', 'total_firstrepaid_late', 'total_closed_late',
'avg_prev_repayment_ratio', 'avg_duration_days', 'avg_prev_interest',
'age'],
dtype='object')After merging and creating the new features, I now have a combined dataset (df_final) that contains customer performance, demographics, and aggregated previous loan information.
Before moving forward, here’s a quick description of the columns in the dataset:
df_final| Column Name | Description |
|---|---|
customerid |
Unique identifier for each customer. |
systemloanid |
Unique identifier for each loan transaction in the system. |
loannumber |
Sequential number representing the loan order for a customer. |
approveddate |
Date the loan was approved. |
creationdate |
Date the loan record was created in the system. |
loanamount |
Amount of money borrowed for the current loan. |
totaldue |
Total amount to be repaid (principal + interest). |
termdays |
Duration of the loan in days. |
good_bad_flag |
Loan performance indicator (e.g., Binary target: 0 = Bad (defaulted), 1 = Good (did not default) |
interest_curr_amount |
Interest amount charged on the current loan. |
interest_curr_rate |
Interest rate applied to the current loan. |
repayment_curr_ratio |
Ratio of amount repaid to total due for the current loan. |
bank_account_type |
Type of bank account the customer holds (e.g., savings, checking). |
longitude_gps |
Longitude coordinate of the customer’s recorded location. |
latitude_gps |
Latitude coordinate of the customer’s recorded location. |
bank_name_clients |
Name of the bank where the customer holds an account. |
employment_status_clients |
Employment status of the customer. |
num_prev_loans |
Total number of previous loans taken by the customer. |
avg_prev_loanamt |
Average loan amount from previous loans. |
total_firstrepaid_late |
Number of previous loans where the first repayment was late. |
total_closed_late |
Number of previous loans closed later than the agreed term. |
avg_prev_repayment_ratio |
Average repayment ratio for previous loans. |
avg_duration_days |
Average duration (in days) of previous loans. |
avg_prev_interest |
Average interest amount paid in previous loans. |
age |
Customer’s age in years. |
After verifying that our dataset is clean, we now move to visualizing each column in our dataset for analysis. To do this I will split the columns into numerical, categorical and datetime groups, as we have in our available columns.
I am going to drop some features as I believe that they will not be useful for our prediction and may be redundant and create noise.
Below is a summary of the columns we removed from our dataset before training the predictive model, along with the reasons for their removal:
| Column Name | Reason for Removal |
|---|---|
customerid |
This is a unique identifier for each customer. It does not have any relationship with loan repayment behavior and provides no predictive signal. |
systemloanid |
This is a unique identifier for each loan transaction. Like customerid, it is purely administrative and carries no predictive value. |
approveddate |
While dates can be useful when transformed into features like “month” or “day of week,” the raw approval date itself is too granular and will not generalize well in predictions. Instead, derived features are preferred. |
creationdate |
Same reasoning as approveddate — raw date values have limited predictive value, but derived temporal patterns may be useful. |
longitude_gps |
Raw GPS coordinates are too detailed to be directly useful |
latitude_gps |
Raw GPS coordinates are too detailed to be directly useful |
loannumber |
Sequential loan count per customer; dropped to prevent data leakage and redundancy with num_prev_loans |
bank_name_clients |
It acts as an identifier and usually doesn't provide meaningful predictive information for loan default risk. |
Before I visualize, I will drop them.
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3269 entries, 0 to 3268 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 loanamount 3269 non-null float64 1 totaldue 3269 non-null float64 2 termdays 3269 non-null int64 3 good_bad_flag 3269 non-null object 4 approved_time 3269 non-null object 5 creation_time 3269 non-null object 6 bank_account_type 3269 non-null object 7 employment_status_clients 3269 non-null object 8 birth_year 3269 non-null int32 9 num_prev_loans 3269 non-null float64 10 avg_prev_loanamt 3269 non-null float64 11 avg_repay_delay_days 3269 non-null float64 12 total_firstrepaid_late 3269 non-null float64 13 total_closed_late 3269 non-null float64 14 avg_prev_repayment_ratio 3269 non-null float64 15 avg_duration_days 3269 non-null float64 16 avg_prev_interest 3269 non-null float64 17 age 3269 non-null int64 18 interest_curr_amount 3269 non-null float64 19 interest_curr_rate 3269 non-null float64 20 repayment_curr_ratio 3269 non-null float64 21 repayment_efficiency 3269 non-null float64 22 late_payment_rate 3269 non-null float64 dtypes: float64(15), int32(1), int64(2), object(5) memory usage: 574.8+ KB
['loanamount', 'totaldue', 'termdays', 'birth_year', 'num_prev_loans', 'avg_prev_loanamt', 'avg_repay_delay_days', 'total_firstrepaid_late', 'total_closed_late', 'avg_prev_repayment_ratio', 'avg_duration_days', 'avg_prev_interest', 'age', 'interest_curr_amount', 'interest_curr_rate', 'repayment_curr_ratio', 'repayment_efficiency', 'late_payment_rate']
['approved_time', 'creation_time', 'bank_account_type', 'employment_status_clients']
# Visualize how the first two categorical columns are distributed
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
for ax, column in zip(axs, cat_cols[:2]):
    sns.countplot(ax=ax, data=df_final, x=column)
    ax.set_title(f'Distribution of {column}')
    ax.tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()
THE DESCRIPTIVE STATISTICS AND CORRELATION MATRIX
# Descriptive statistics, transposed so features are rows (2 d.p.)
display(df_final.describe().T.round(2))

# Correlation matrix of the numeric features
plt.figure(figsize=(12, 8))
corr = df_final[num_cols].corr()
sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Correlation Matrix of Numerical Features')
plt.show()
THE TARGET COLUMN
Next, I move to visualizing the target column, i.e., good_bad_flag, which is the loan performance indicator (a binary target: 0 = Bad (defaulted), 1 = Good (did not default)).
# Bar chart of the target class counts (class imbalance check)
df_final['good_bad_flag'].value_counts().plot(kind='bar')
THE TARGET COLUMN: UNIQUE VALUES AND ENCODING
# unique value of the target column — confirms which labels are present
# before/after the Good/Bad -> 1/0 mapping
df_final.good_bad_flag.unique()
array(['Good', 'Bad'], dtype=object)
array([1, 0])
# Target Column — duplicate of the earlier bar chart, re-run after the
# label encoding; NOTE(review): consider deleting one of the two copies.
df_final.good_bad_flag.value_counts().plot(kind='bar')
num_cols
['loanamount', 'totaldue', 'termdays', 'birth_year', 'num_prev_loans', 'avg_prev_loanamt', 'avg_repay_delay_days', 'total_firstrepaid_late', 'total_closed_late', 'avg_prev_repayment_ratio', 'avg_duration_days', 'avg_prev_interest', 'age', 'interest_curr_amount', 'interest_curr_rate', 'repayment_curr_ratio', 'repayment_efficiency', 'late_payment_rate']
# checking correlation matrix
# NOTE(review): `num_col` (singular) is defined only in a LATER cell — the
# curated 13-column list that includes the target. This cell therefore relies
# on out-of-order notebook execution; reorder before converting to a script.
heatmap = df_final[num_col].corr()
plt.figure(figsize=(20, 10))
sns.heatmap(data=heatmap, fmt=".2f", annot=True, cmap="coolwarm")
plt.title("Correlation Matrix", fontweight="bold")
plt.show()
Correlation with the target (strongest first): late_payment_rate: -0.28 (best predictor); avg_repay_delay_days: -0.23 (second best); total_firstrepaid_late: -0.14; loanamount: 0.12; repayment_curr_ratio: 0.12; age: 0.06; termdays: 0.02; repayment_efficiency: -0.02.
Remove these highly collinear features immediately: totaldue (r=0.99 with loanamount); interest_curr_amount (r=0.88 with termdays); avg_prev_loanamt (r=0.85 with num_prev_loans); total_closed_late (r=0.88 with total_firstrepaid_late).
df_final
# Numeric columns retained after dropping the highly collinear features;
# the target ('good_bad_flag') is kept so correlation-with-target is visible.
num_col = [
    'loanamount', 'good_bad_flag', 'termdays', 'repayment_curr_ratio',
    'num_prev_loans', 'avg_repay_delay_days', 'total_firstrepaid_late',
    'avg_prev_repayment_ratio', 'avg_duration_days', 'avg_prev_interest',
    'age', 'repayment_efficiency', 'late_payment_rate',
]
Start coding or generate with AI.
Start coding or generate with AI.
# Data Preparation
# viewing the columns
df_final.columns
Index(['loanamount', 'termdays', 'good_bad_flag', 'approved_time',
'creation_time', 'bank_account_type', 'employment_status_clients',
'birth_year', 'num_prev_loans', 'avg_repay_delay_days',
'total_firstrepaid_late', 'avg_prev_repayment_ratio',
'avg_duration_days', 'avg_prev_interest', 'age', 'interest_curr_rate',
'repayment_curr_ratio', 'repayment_efficiency', 'late_payment_rate'],
dtype='object')# separate the features and target
X = df_final.drop(columns='good_bad_flag')  # all predictors
y = df_final['good_bad_flag']               # binary target (per earlier mapping cell: 1 = Good, 0 = Bad)
# separate into training and testing — stratified so both splits keep the class ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify=y)
Start coding or generate with AI.
# split into cat and num cols (by dtype, so later ColumnTransformer branches line up)
num_cols = X.select_dtypes(include=np.number).columns.tolist()   # numeric feature names
cat_cols = X.select_dtypes(include='object').columns.tolist()    # object/categorical feature names
['approved_time', 'creation_time', 'bank_account_type', 'employment_status_clients']
['approved_time', 'creation_time', 'bank_account_type', 'employment_status_clients']
metrics_df
metrics_df
Start coding or generate with AI.
Start coding or generate with AI.
# Train/evaluate every candidate model with an identical
# preprocess -> SMOTE -> variance-filter -> classifier pipeline,
# collecting train/test metrics and one confusion matrix per model.
results = {}
fig, axes = plt.subplots(2, 3, figsize=(16, 4))  # one axis per model
for (name, model), ax in zip(models.items(), axes.flatten()):
    # Pipeline with SMOTE balancing (imblearn Pipeline: SMOTE resamples
    # only during fit, never during predict/transform)
    pipeline = Pipeline([
        ('preprocessor', preprocessor),                     # handles encoding/scaling
        ('smote', SMOTE(random_state=42)),                  # balances the classes
        ('var_thresh', VarianceThreshold(threshold=0.01)),  # removes low-variance features
        ('classifier', model)                               # the model itself
    ])
    # Fit on raw training data (SMOTE applied internally to training data only)
    pipeline.fit(X_train, y_train)
    # Predictions
    train_pred = pipeline.predict(X_train)
    test_pred = pipeline.predict(X_test)
    # Determine if binary or multi-class
    n_classes = len(np.unique(y_train))
    is_binary = n_classes == 2  # NOTE(review): computed but never used below
    # Compute metrics (simplified for binary classification; pos_label defaults to 1)
    train_acc = accuracy_score(y_train, train_pred)
    test_acc = accuracy_score(y_test, test_pred)
    precision = precision_score(y_test, test_pred)
    recall = recall_score(y_test, test_pred)
    f1 = f1_score(y_test, test_pred)
    # ROC AUC (needs class probabilities)
    if hasattr(pipeline.named_steps['classifier'], "predict_proba"):
        test_pred_proba = pipeline.predict_proba(X_test)[:, 1]  # positive class probability
        roc_auc = roc_auc_score(y_test, test_pred_proba)
    else:
        roc_auc = None  # model has no probability estimates
    results[name] = {
        "Train Accuracy": train_acc,
        "Test Accuracy": test_acc,
        "Precision Score": precision,
        "Recall Score": recall,
        "F1 Score": f1,
        "ROC AUC": roc_auc
    }
    # Printing classification report for each model
    print(f"\nDetailed Classification Report for {name}:")
    print("=" * 60)
    print(classification_report(y_test, test_pred, target_names=['0 (Default)', '1 (No Default)']))
    # Confusion matrix
    cm = confusion_matrix(y_test, test_pred)  # y_true first, y_pred second
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=np.unique(y_train))
    disp.plot(ax=ax, cmap='Blues')
    ax.set_title(f'{name}\nF1: {f1:.3f}')
plt.tight_layout()
plt.show()
# Display results
metrics_df = pd.DataFrame(results).T  # Transpose for models as rows
print("\nModel Performance (After SMOTE Balancing):")
print("=" * 55)
print(metrics_df.round(3))
# Highlight best performing models
print("\nBest Models by Metric:")
print("=" * 30)
for metric in metrics_df.columns:
    # only skip ROC AUC when every model lacked predict_proba
    if metric != 'ROC AUC' or metrics_df[metric].notna().any():
        best_model = metrics_df[metric].idxmax()
        best_score = metrics_df.loc[best_model, metric]
        print(f"{metric:<15}: {best_model:<20} ({best_score:.3f})")
# Show class distribution analysis (of the ORIGINAL, unresampled train split)
print(f"\nOriginal Class Distribution:")
print("=" * 35)
unique, counts = np.unique(y_train, return_counts=True)
for class_label, count in zip(unique, counts):
    percentage = (count / len(y_train)) * 100
    print(f"Class {class_label}: {count:>6} samples ({percentage:>5.1f}%)")
imbalance_ratio = max(counts) / min(counts)
print(f"\nOriginal Imbalance Ratio: {imbalance_ratio:.2f}:1")
print("✅ SMOTE applied - classes are now balanced in training data")
# Preprocess first (fits the ColumnTransformer on the training split)
X_train_processed = preprocessor.fit_transform(X_train)
# Apply SMOTE manually to inspect what the pipeline does internally during fit
X_resampled, y_resampled = pipeline.named_steps['smote'].fit_resample(X_train_processed, y_train)
# Check balance — both classes should now have equal counts
print(y_resampled.value_counts())
good_bad_flag 1 2045 0 2045 Name: count, dtype: int64
# FIX: the original cell applied SMOTE to the TEST set and refitted the
# preprocessor on it. Both are wrong: resampling evaluation data fabricates
# synthetic test samples (invalidating every metric), and fit_transform on
# the test split leaks test statistics into the preprocessing. The test set
# is only ever *transformed* with the preprocessor fitted on training data.
X_test_processed = preprocessor.transform(X_test)
# Check the (untouched) test class balance
print(y_test.value_counts())
good_bad_flag 0 511 1 511 Name: count, dtype: int64
metrics_df
metrics_df
df_final.columns
Index(['loanamount', 'termdays', 'good_bad_flag', 'approved_time',
'creation_time', 'bank_account_type', 'employment_status_clients',
'birth_year', 'num_prev_loans', 'avg_repay_delay_days',
'total_firstrepaid_late', 'avg_prev_repayment_ratio',
'avg_duration_days', 'avg_prev_interest', 'age', 'interest_curr_rate',
'repayment_curr_ratio', 'repayment_efficiency', 'late_payment_rate'],
# Square-root transforms to reduce the right skew seen in these features
for raw_col in ('late_payment_rate', 'termdays', 'loanamount',
                'avg_prev_interest', 'repayment_efficiency'):
    df_final[f'sqrt_{raw_col}'] = np.sqrt(df_final[raw_col])
target_columns = ['sqrt_late_payment_rate', 'sqrt_termdays', 'sqrt_loanamount',
                  'sqrt_avg_prev_interest', 'sqrt_repayment_efficiency']

# Histograms (with KDE) of the sqrt-transformed features on a 3x2 grid
fig, axes = plt.subplots(3, 2, figsize=(15, 8))
axes = axes.flatten()
for ax, column in zip(axes, target_columns):
    sns.histplot(data=df_final, x=column, bins=30, kde=True,
                 edgecolor='black', alpha=0.7, ax=ax)
    ax.set_title(f"Distribution of {column}")
fig.delaxes(axes[-1])  # only 5 plots fill the 6-slot grid
plt.tight_layout()
plt.show()
target_columns = ['sqrt_late_payment_rate', 'sqrt_termdays', 'sqrt_loanamount',
                  'sqrt_avg_prev_interest', 'sqrt_repayment_efficiency']

# Boxplots of the same sqrt-transformed features to eyeball remaining outliers
fig, axes = plt.subplots(3, 2, figsize=(15, 8))
axes = axes.flatten()
for ax, column in zip(axes, target_columns):
    sns.boxplot(data=df_final, x=column, ax=ax)
    ax.set_title(f"Distribution of {column}")
fig.delaxes(axes[-1])  # only 5 plots fill the 6-slot grid
plt.tight_layout()
plt.show()
# Cap extreme values of repayment_efficiency at the 99th percentile.
# FIX: the derived column sqrt_repayment_efficiency was built from the
# UNCAPPED values in an earlier cell, so it is recomputed here to keep
# the raw and sqrt versions consistent.
cap_value = df_final['repayment_efficiency'].quantile(0.99)
df_final['repayment_efficiency'] = df_final['repayment_efficiency'].clip(upper=cap_value)
df_final['sqrt_repayment_efficiency'] = np.sqrt(df_final['repayment_efficiency'])
sns.boxplot(df_final['repayment_efficiency'])
# Correlation of the sqrt-transformed features with the target
target_columns = ['sqrt_late_payment_rate', 'sqrt_termdays', 'sqrt_loanamount',
                  'sqrt_avg_prev_interest', 'sqrt_repayment_efficiency',
                  'good_bad_flag']
corr_matrix = df_final[target_columns].corr()
plt.figure(figsize=(20, 10))
sns.heatmap(data=corr_matrix, fmt=".2f", annot=True, cmap="coolwarm")
plt.title("Correlation Matrix", fontweight="bold")
plt.show()
df_final.columns
Index(['loanamount', 'termdays', 'good_bad_flag', 'approved_time',
'creation_time', 'bank_account_type', 'employment_status_clients',
'birth_year', 'num_prev_loans', 'avg_repay_delay_days',
'total_firstrepaid_late', 'avg_prev_repayment_ratio',
'avg_duration_days', 'avg_prev_interest', 'age', 'interest_curr_rate',
'repayment_curr_ratio', 'repayment_efficiency', 'late_payment_rate',
'sqrt_late_payment_rate', 'sqrt_termdays', 'sqrt_loanamount',
'sqrt_avg_prev_interest', 'sqrt_repayment_efficiency'],
# Prediction features: drop the raw columns that were replaced by their
# sqrt versions, plus the target itself
X = df_final.drop(columns=['late_payment_rate', 'termdays', 'good_bad_flag',
                           'loanamount', 'avg_prev_interest',
                           'repayment_efficiency'])
y = df_final["good_bad_flag"]
# separate into training and testing — same seed/stratification as the first split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify=y)
# split into cat and num cols (recomputed for the reduced feature set)
num_cols = X.select_dtypes(include=np.number).columns.tolist()
cat_cols = X.select_dtypes(include='object').columns.tolist()
num_cols
['birth_year', 'num_prev_loans', 'avg_repay_delay_days', 'total_firstrepaid_late', 'avg_prev_repayment_ratio', 'avg_duration_days', 'age', 'interest_curr_rate', 'repayment_curr_ratio', 'sqrt_late_payment_rate', 'sqrt_termdays', 'sqrt_loanamount', 'sqrt_avg_prev_interest', 'sqrt_repayment_efficiency']
cat_cols
['approved_time', 'creation_time', 'bank_account_type', 'employment_status_clients']
# COLUMN TRANSFORMER PIPELINE
# Numeric branch: standardize to zero mean / unit variance
num_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical branch: one-hot encode; unseen categories at transform
# time are ignored instead of raising
cat_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

# Route each column list through its branch
preprocessor = ColumnTransformer(transformers=[
    ('scaled_num', num_transformer, num_cols),
    ('encoded_cat', cat_transformer, cat_cols),
])
# Candidate classifiers, all seeded for reproducibility
# (CatBoost uses `random_seed`, the others `random_state`)
models = {
    'Logistic Regression': LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42),
    "Gradient Boost": GradientBoostingClassifier(random_state=42),
    "Cat Boost": CatBoostClassifier(random_seed=42, verbose=0),
}
# Re-run of the model comparison, now on the sqrt-transformed / reduced
# feature set (same pipeline shape as the first comparison cell).
results = {}
fig, axes = plt.subplots(2, 3, figsize=(16, 4))  # one axis per model
for (name, model), ax in zip(models.items(), axes.flatten()):
    # Pipeline with SMOTE balancing (imblearn Pipeline: SMOTE resamples
    # only during fit, never during predict/transform)
    pipeline = Pipeline([
        ('preprocessor', preprocessor),                     # handles encoding/scaling
        ('smote', SMOTE(random_state=42)),                  # balances the classes
        ('var_thresh', VarianceThreshold(threshold=0.01)),  # removes low-variance features
        ('classifier', model)                               # the model itself
    ])
    # Fit on raw training data (SMOTE applied internally to training data only)
    pipeline.fit(X_train, y_train)
    # Predictions
    train_pred = pipeline.predict(X_train)
    test_pred = pipeline.predict(X_test)
    # Determine if binary or multi-class
    n_classes = len(np.unique(y_train))
    is_binary = n_classes == 2  # NOTE(review): computed but never used below
    # Compute metrics (simplified for binary classification; pos_label defaults to 1)
    train_acc = accuracy_score(y_train, train_pred)
    test_acc = accuracy_score(y_test, test_pred)
    precision = precision_score(y_test, test_pred)
    recall = recall_score(y_test, test_pred)
    f1 = f1_score(y_test, test_pred)
    # ROC AUC (needs class probabilities)
    if hasattr(pipeline.named_steps['classifier'], "predict_proba"):
        test_pred_proba = pipeline.predict_proba(X_test)[:, 1]  # positive class probability
        roc_auc = roc_auc_score(y_test, test_pred_proba)
    else:
        roc_auc = None  # model has no probability estimates
    results[name] = {
        "Train Accuracy": train_acc,
        "Test Accuracy": test_acc,
        "Precision Score": precision,
        "Recall Score": recall,
        "F1 Score": f1,
        "ROC AUC": roc_auc
    }
    # Printing classification report for each model
    print(f"\nDetailed Classification Report for {name}:")
    print("=" * 60)
    print(classification_report(y_test, test_pred, target_names=['0 (Default)', '1 (No Default)']))
    # Confusion matrix
    cm = confusion_matrix(y_test, test_pred)  # y_true first, y_pred second
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=np.unique(y_train))
    disp.plot(ax=ax, cmap='Blues')
    ax.set_title(f'{name}\nF1: {f1:.3f}')
plt.tight_layout()
plt.show()
# Display results
metrics_df = pd.DataFrame(results).T  # Transpose for models as rows
print("\nModel Performance (After SMOTE Balancing):")
print("=" * 55)
print(metrics_df.round(3))
# Highlight best performing models
print("\nBest Models by Metric:")
print("=" * 30)
for metric in metrics_df.columns:
    # only skip ROC AUC when every model lacked predict_proba
    if metric != 'ROC AUC' or metrics_df[metric].notna().any():
        best_model = metrics_df[metric].idxmax()
        best_score = metrics_df.loc[best_model, metric]
        print(f"{metric:<15}: {best_model:<20} ({best_score:.3f})")
# Show class distribution analysis (of the ORIGINAL, unresampled train split)
print(f"\nOriginal Class Distribution:")
print("=" * 35)
unique, counts = np.unique(y_train, return_counts=True)
for class_label, count in zip(unique, counts):
    percentage = (count / len(y_train)) * 100
    print(f"Class {class_label}: {count:>6} samples ({percentage:>5.1f}%)")
imbalance_ratio = max(counts) / min(counts)
print(f"\nOriginal Imbalance Ratio: {imbalance_ratio:.2f}:1")
print("✅ SMOTE applied - classes are now balanced in training data")
from sklearn.metrics import roc_curve, auc
# ROC Curve Plot — overlay every model's ROC curve on one figure.
# NOTE(review): this refits each pipeline from scratch rather than reusing
# the models fitted in the previous cell (duplicated training time).
plt.figure(figsize=(10, 8))
colors = ['blue', 'red', 'green', 'orange', 'purple', 'brown']
for (name, model), color in zip(models.items(), colors):
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('smote', SMOTE(random_state=42)),
        ('var_thresh', VarianceThreshold(threshold=0.01)),
        ('classifier', model)
    ])
    pipeline.fit(X_train, y_train)
    # Only models exposing probabilities can produce an ROC curve
    if hasattr(pipeline.named_steps['classifier'], "predict_proba"):
        y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
        fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, color=color, lw=2,
                 label=f'{name} (AUC = {roc_auc:.3f})')
# Reference line: a random classifier's diagonal
plt.plot([0, 1], [0, 1], 'k--', alpha=0.7, label='Random (AUC = 0.500)')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves - Model Comparison')
plt.legend(loc="lower right")
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
Start coding or generate with AI.
Start coding or generate with AI.
# Single-model run: Gradient Boosting through the same pipeline, with a
# full metric printout, classification report and confusion matrix.
# Initialize results dictionary
results = {}
# Model name
name = "GradientBoostClassifier"
# Final model
final_model = GradientBoostingClassifier(random_state=42)
# Pipeline (same preprocess -> SMOTE -> variance-filter stack as before)
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('var_thresh', VarianceThreshold(threshold=0.01)),
    ('classifier', final_model)
])
# Fit the pipeline
pipeline.fit(X_train, y_train)
# Predictions
train_pred = pipeline.predict(X_train)
test_pred = pipeline.predict(X_test)
# Metrics
train_acc = accuracy_score(y_train, train_pred)
test_acc = accuracy_score(y_test, test_pred)
precision = precision_score(y_test, test_pred)
recall = recall_score(y_test, test_pred)
f1 = f1_score(y_test, test_pred)
# ROC AUC needs class probabilities
if hasattr(pipeline.named_steps['classifier'], "predict_proba"):
    test_pred_proba = pipeline.predict_proba(X_test)[:, 1]
    roc_auc = roc_auc_score(y_test, test_pred_proba)
else:
    roc_auc = None
# Save results
results[name] = {
    "Train Accuracy": train_acc,
    "Test Accuracy": test_acc,
    "Precision Score": precision,
    "Recall Score": recall,
    "F1 Score": f1,
    "ROC AUC": roc_auc
}
# === PRINT RESULTS ===
print(f"\n📊 Results for {name}:")
for metric, value in results[name].items():
    if value is not None:
        print(f"{metric}: {value:.4f}")
    else:
        print(f"{metric}: None")
# === TABLE VIEW OF RESULTS ===
df_results = pd.DataFrame(results).T  # Models as rows
print("\n📋 All Results Table:")
print(df_results)
# === CLASSIFICATION REPORT ===
print("\n📑 Classification Report:")
print(classification_report(
    y_test, test_pred,
    target_names=[str(lbl) for lbl in np.unique(y_train)]
))
# === CONFUSION MATRIX PLOT ===
fig, ax = plt.subplots()
cm = confusion_matrix(y_test, test_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=np.unique(y_train))
disp.plot(ax=ax, cmap='Blues')
ax.set_title(f'{name}\nF1: {f1:.3f}')
plt.tight_layout()
plt.show()
from sklearn.model_selection import StratifiedKFold, cross_val_score
import numpy as np
# Stratified cross-validation comparison of all candidates on three metrics
# (ROC-AUC, PR-AUC, F1) to choose hyperparameter-tuning candidates.
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42),
    'Gradient Boost': GradientBoostingClassifier(random_state=42),
    'Cat Boost': CatBoostClassifier(random_state=42, verbose=False)
}
# 5-fold stratified cross validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
print("Stratified CV Results:")
print("=" * 70)
# Store results for ranking
results = {}
for name, model in models.items():
    # NOTE(review): var_thresh comes BEFORE smote here — the reverse of the
    # earlier training cells. Confirm which ordering is intended.
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('var_thresh', VarianceThreshold(threshold=0.01)),
        ('smote', SMOTE(random_state=42)),
        ('classifier', model)
    ])
    # Multiple metrics for comprehensive evaluation (each call refits per fold,
    # so the model trains 3x5 = 15 times here)
    roc_scores = cross_val_score(pipeline, X_train, y_train, cv=skf, scoring='roc_auc', n_jobs=-1)
    pr_scores = cross_val_score(pipeline, X_train, y_train, cv=skf, scoring='average_precision', n_jobs=-1)
    f1_scores = cross_val_score(pipeline, X_train, y_train, cv=skf, scoring='f1', n_jobs=-1)
    # Calculate means and stability
    roc_mean, roc_std = roc_scores.mean(), roc_scores.std()
    pr_mean, pr_std = pr_scores.mean(), pr_scores.std()
    f1_mean, f1_std = f1_scores.mean(), f1_scores.std()
    stability = "Stable" if roc_std < 0.05 else "Unstable"  # heuristic cut on ROC spread
    # Store all metrics
    results[name] = {
        'roc_mean': roc_mean, 'roc_std': roc_std,
        'pr_mean': pr_mean, 'pr_std': pr_std,
        'f1_mean': f1_mean, 'f1_std': f1_std
    }
    print(f"{name:<20}: ROC={roc_mean:.3f}(±{roc_std:.3f}) | PR-AUC={pr_mean:.3f}(±{pr_std:.3f}) | F1={f1_mean:.3f}(±{f1_std:.3f}) [{stability}]")
# Model selection recommendations
print("\n" + "=" * 70)
print("MODEL SELECTION RECOMMENDATIONS:")
print("-" * 40)
# Rank by PR-AUC (best for imbalanced data) with ROC as tie-breaker
sorted_by_pr = sorted(results.items(), key=lambda x: (x[1]['pr_mean'], x[1]['roc_mean']), reverse=True)
sorted_by_roc = sorted(results.items(), key=lambda x: x[1]['roc_mean'], reverse=True)
print("Rankings:")
print(f"🎯 Best by PR-AUC: {sorted_by_pr[0][0]} (PR={sorted_by_pr[0][1]['pr_mean']:.3f})")
print(f"🏆 Best by ROC-AUC: {sorted_by_roc[0][0]} (ROC={sorted_by_roc[0][1]['roc_mean']:.3f})")
# Overall recommendation (PR-AUC primary for imbalanced data)
best_model = sorted_by_pr[0][0]
best_pr_score = sorted_by_pr[0][1]['pr_mean']
print(f"✅ RECOMMENDED: {best_model} (Focus on PR-AUC for imbalanced data)")
# Top candidates for hyperparameter tuning: within 0.02 PR-AUC of the best
tuning_candidates = [name for name, scores in sorted_by_pr
                     if scores['pr_mean'] >= best_pr_score - 0.02]
print(f"🔧 Tune these models: {', '.join(tuning_candidates)}")
print(f"📈 Expected PR-AUC after tuning: {best_pr_score + 0.02:.3f} - {best_pr_score + 0.05:.3f}")
# Save for next steps
best_models = dict(sorted_by_pr[:3])
print(f"💾 Top 3 saved for further analysis: {list(best_models.keys())}")
print("\n💡 TIP: PR-AUC is better than ROC-AUC for imbalanced datasets!")
Stratified CV Results: ====================================================================== Logistic Regression : ROC=0.698(±0.025) | PR-AUC=0.877(±0.014) | F1=0.789(±0.008) [Stable] Decision Tree : ROC=0.572(±0.028) | PR-AUC=0.808(±0.010) | F1=0.796(±0.016) [Stable] Random Forest : ROC=0.653(±0.038) | PR-AUC=0.856(±0.019) | F1=0.832(±0.010) [Stable] XGBoost : ROC=0.655(±0.042) | PR-AUC=0.861(±0.023) | F1=0.847(±0.007) [Stable] Gradient Boost : ROC=0.678(±0.039) | PR-AUC=0.868(±0.020) | F1=0.846(±0.010) [Stable] Cat Boost : ROC=0.669(±0.049) | PR-AUC=0.865(±0.025) | F1=0.855(±0.013) [Stable] ====================================================================== MODEL SELECTION RECOMMENDATIONS: ---------------------------------------- Rankings: 🎯 Best by PR-AUC: Logistic Regression (PR=0.877) 🏆 Best by ROC-AUC: Logistic Regression (ROC=0.698) ✅ RECOMMENDED: Logistic Regression (Focus on PR-AUC for imbalanced data) 🔧 Tune these models: Logistic Regression, Gradient Boost, Cat Boost, XGBoost 📈 Expected PR-AUC after tuning: 0.897 - 0.927 💾 Top 3 saved for further analysis: ['Logistic Regression', 'Gradient Boost', 'Cat Boost'] 💡 TIP: PR-AUC is better than ROC-AUC for imbalanced datasets!
# Near-duplicate of the earlier Gradient Boosting cell (only `name` differs);
# NOTE(review): consider keeping a single copy of this evaluation block.
# Initialize results dictionary
results = {}
# Model name
name = "GradientBoosting"
# Final model
final_model = GradientBoostingClassifier(random_state=42)
# Pipeline (same preprocess -> SMOTE -> variance-filter stack as before)
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('var_thresh', VarianceThreshold(threshold=0.01)),
    ('classifier', final_model)
])
# Fit the pipeline
pipeline.fit(X_train, y_train)
# Predictions
train_pred = pipeline.predict(X_train)
test_pred = pipeline.predict(X_test)
# Metrics
train_acc = accuracy_score(y_train, train_pred)
test_acc = accuracy_score(y_test, test_pred)
precision = precision_score(y_test, test_pred)
recall = recall_score(y_test, test_pred)
f1 = f1_score(y_test, test_pred)
# ROC AUC needs class probabilities
if hasattr(pipeline.named_steps['classifier'], "predict_proba"):
    test_pred_proba = pipeline.predict_proba(X_test)[:, 1]
    roc_auc = roc_auc_score(y_test, test_pred_proba)
else:
    roc_auc = None
# Save results
results[name] = {
    "Train Accuracy": train_acc,
    "Test Accuracy": test_acc,
    "Precision Score": precision,
    "Recall Score": recall,
    "F1 Score": f1,
    "ROC AUC": roc_auc
}
# === PRINT RESULTS ===
print(f"\n📊 Results for {name}:")
for metric, value in results[name].items():
    if value is not None:
        print(f"{metric}: {value:.4f}")
    else:
        print(f"{metric}: None")
# === TABLE VIEW OF RESULTS ===
df_results = pd.DataFrame(results).T  # Models as rows
print("\n📋 All Results Table:")
print(df_results)
# === CLASSIFICATION REPORT ===
print("\n📑 Classification Report:")
print(classification_report(
    y_test, test_pred,
    target_names=[str(lbl) for lbl in np.unique(y_train)]
))
# === CONFUSION MATRIX PLOT ===
fig, ax = plt.subplots()
cm = confusion_matrix(y_test, test_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=np.unique(y_train))
disp.plot(ax=ax, cmap='Blues')
ax.set_title(f'{name}\nF1: {f1:.3f}')
plt.tight_layout()
plt.show()
Start coding or generate with AI.
Start coding or generate with AI.
# ===== 1. Fit the pipeline =====
# NOTE(review): this reuses whichever `pipeline` variable is currently in
# scope — at this point in the file that is the GradientBoosting pipeline
# from the previous cell, NOT CatBoost as originally claimed here.
pipeline.fit(X_train, y_train)
# ===== 2. Extract the trained classifier from the fitted pipeline =====
model = pipeline.named_steps['classifier']
# ===== 3. Transform X_train with preprocessing only (no SMOTE at transform time) =====
X_train_processed = pipeline.named_steps['preprocessor'].transform(X_train)
# ===== 4. Get proper feature names from the preprocessor =====
# This works for most ColumnTransformer setups with OneHotEncoder
def get_feature_names(preprocessor):
    """Return the output feature names of a fitted ColumnTransformer.

    Fixes over the first draft:
    - 'passthrough' transformers are handled explicitly (they keep the
      input column names instead of hitting the generic fallback).
    - An inner Pipeline is asked for its own ``get_feature_names_out``;
      previously only the *first* step was inspected, which is wrong for
      multi-step pipelines (e.g. imputer -> encoder).

    Parameters:
        preprocessor: fitted ColumnTransformer (must expose ``transformers_``).

    Returns:
        list of output feature names, in transformer order.
    """
    feature_names = []
    for name, transformer, columns in preprocessor.transformers_:
        if transformer == 'drop':
            continue  # dropped columns contribute no output features
        if transformer == 'passthrough':
            feature_names.extend(columns)
            continue
        if hasattr(transformer, 'get_feature_names_out'):
            # Pipelines and most sklearn transformers implement this and
            # expand columns (e.g. one-hot) correctly
            feature_names.extend(transformer.get_feature_names_out(columns))
        else:
            # Fallback: transformer keeps one output per input column
            feature_names.extend(columns)
    return feature_names
# Recover human-readable feature names so SHAP plots are interpretable
feature_names = get_feature_names(pipeline.named_steps['preprocessor'])
# Convert to DataFrame so SHAP sees column names
X_train_processed_df = pd.DataFrame(X_train_processed, columns=feature_names)
# ===== 5. SHAP Analysis =====
# TreeExplainer works for tree ensembles (the fitted classifier in `model`)
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_train_processed_df)
# Summary plots with real feature names: bar = mean |SHAP|, beeswarm = per-sample
shap.summary_plot(shap_values, X_train_processed_df, plot_type='bar')
shap.summary_plot(shap_values, X_train_processed_df)
# Force plot for one sample (local explanation of a single prediction)
sample_index = 0
shap.force_plot(
    explainer.expected_value,
    shap_values[sample_index, :],
    X_train_processed_df.iloc[sample_index, :]
)
Start coding or generate with AI.
Start coding or generate with AI.
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier
from imblearn.pipeline import Pipeline  # Use imblearn's Pipeline instead
from sklearn.feature_selection import VarianceThreshold
from imblearn.over_sampling import SMOTE
import numpy as np
# Randomized hyperparameter search for Gradient Boosting, scored on PR-AUC.
# Same stratified CV setup
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
print("🔧 HYPERPARAMETER TUNING - GRADIENT BOOST")
print("=" * 50)
# GRADIENT BOOST TUNING
print("Tuning Gradient Boost...")
print("-" * 30)
gb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('var_thresh', VarianceThreshold(threshold=0.01)),
    ('classifier', GradientBoostingClassifier(random_state=42))
])
# Search space; the 'classifier__' prefix routes params to that pipeline step
gb_params = {
    'classifier__learning_rate': [0.01, 0.1, 0.2],
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [3, 5, 7],
    'classifier__subsample': [0.8, 1.0]
}
# Use RandomizedSearchCV for faster search than an exhaustive grid
gb_random = RandomizedSearchCV(
    gb_pipeline, gb_params,
    n_iter=20,  # Try 20 combinations
    cv=skf, scoring='average_precision',  # PR-AUC
    n_jobs=-1, verbose=1, random_state=42
)
gb_random.fit(X_train, y_train)
# Results
original_score = 0.867  # hard-coded baseline PR-AUC from the earlier CV cell
best_score = gb_random.best_score_
improvement = best_score - original_score
print(f"\n🏆 GRADIENT BOOST TUNING RESULTS:")
print("=" * 40)
print(f"✅ Best PR-AUC: {best_score:.3f} (Improvement: +{improvement:.3f})")
print(f"🎛️ Best parameters:")
for param, value in gb_random.best_params_.items():
    print(f" {param}: {value}")
# Save best model (the whole refitted pipeline, not just the classifier)
best_gb_model = gb_random.best_estimator_
print(f"\n🎯 NEXT STEP: Test tuned Gradient Boost on holdout test set!")
print("💾 Best model saved as 'best_gb_model' variable")
print(f"\n🔍 Cross-validation confidence: {best_score:.3f}")
print(f"📈 Expected test performance: {best_score - 0.02:.3f} - {best_score:.3f}")
🔧 HYPERPARAMETER TUNING - GRADIENT BOOST ================================================== Tuning Gradient Boost... ------------------------------ Fitting 5 folds for each of 20 candidates, totalling 100 fits 🏆 GRADIENT BOOST TUNING RESULTS: ======================================== ✅ Best PR-AUC: 0.878 (Improvement: +0.011) 🎛️ Best parameters: classifier__subsample: 0.8 classifier__n_estimators: 200 classifier__max_depth: 3 classifier__learning_rate: 0.01 🎯 NEXT STEP: Test tuned Gradient Boost on holdout test set! 💾 Best model saved as 'best_gb_model' variable 🔍 Cross-validation confidence: 0.878 📈 Expected test performance: 0.858 - 0.878
# ===== 1. FINAL MODEL EVALUATION ON TEST SET =====
from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    classification_report,
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)
print("🔍 FINAL MODEL EVALUATION")
print("=" * 40)
# Your tuned model is stored in: best_gb_model
final_model = best_gb_model
# Test set predictions
y_test_pred = final_model.predict(X_test)
y_test_proba = final_model.predict_proba(X_test)[:, 1]  # positive-class probabilities
# Calculate final metrics on the untouched holdout split
test_roc_auc = roc_auc_score(y_test, y_test_proba)
test_pr_auc = average_precision_score(y_test, y_test_proba)
print(f"📊 FINAL TEST SET PERFORMANCE:")
print(f" ROC-AUC: {test_roc_auc:.3f}")
print(f" PR-AUC: {test_pr_auc:.3f}")
print(f" Expected: {best_score - 0.02:.3f} - {best_score:.3f}")
# Classification report
print(f"\n📋 CLASSIFICATION REPORT:")
print(classification_report(y_test, y_test_pred))
# ===== 2. SAVE MODEL FOR DEPLOYMENT =====
print(f"\n💾 SAVING MODEL FOR DEPLOYMENT")
print("=" * 40)
# Save model metadata
# NOTE(review): model_info is built but never written to disk in this cell
model_info = {
    'model_type': 'GradientBoostingClassifier',
    'cv_score': best_score,
    'test_roc_auc': test_roc_auc,
    'test_pr_auc': test_pr_auc,
    'best_params': gb_random.best_params_,
    'features_shape': X_train.shape[1],
    'training_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
}
🔍 FINAL MODEL EVALUATION
========================================
📊 FINAL TEST SET PERFORMANCE:
ROC-AUC: 0.709
PR-AUC: 0.886
Expected: 0.858 - 0.878
📋 CLASSIFICATION REPORT:
precision recall f1-score support
0 0.42 0.50 0.46 143
1 0.85 0.81 0.83 511
accuracy 0.74 654
macro avg 0.64 0.65 0.64 654
weighted avg 0.76 0.74 0.75 654
💾 SAVING MODEL FOR DEPLOYMENT
========================================
# fit it on the whole dataset (train + test) so the deployed model sees all
# available labelled data before serialization; test metrics were recorded
# above from the train-only fit.
final_model.fit(X, y)
# Save the complete trained pipeline (preprocessor + SMOTE + classifier)
joblib.dump(final_model, 'gradients_boost_models.pkl')
['gradients_boost_models.pkl']
# Load our saved Model — round-trip check that the pickle is readable
model = joblib.load("gradients_boost_models.pkl")
model  # display the pipeline repr as cell output
Start coding or generate with AI.
# Inspect every distinct value of the engineered mean-loan-duration feature
# (in days) in the final modelling frame
df_final.avg_duration_days.unique()
array([ 29.45454545, 0. , 18.16666667, 31.5 ,
27. , 19.5 , 22.11111111, 17.66666667,
12. , 13. , 23. , 26.6 ,
21.5 , 28. , 25. , 20.78571429,
16.22222222, 22.5 , 9. , 12.83333333,
5. , 11.5 , 12.66666667, 21.75 ,
6. , 29.75 , 24.33333333, 29. ,
27.5 , 24. , 14. , 24.78571429,
16.66666667, 22.57142857, 24.125 , 21.2 ,
8. , 17.85714286, 26.5625 , 15.66666667,
24.18181818, 23.54545455, 15.09090909, 19.83333333,
21. , 43.28571429, 17. , 11. ,
14.6 , 28.1 , 36.09090909, 26.4 ,
19.85714286, 20.5 , 19.75 , 26. ,
26.5 , 26.375 , 16.33333333, 30. ,
22. , 14.5 , 33.625 , 22.33333333,
54. , 20.25 , 25.66666667, 19.8 ,
32. , 31. , 19.66666667, 37. ,
20. , 26.76470588, 19. , 17.18181818,
14.66666667, 16.5 , 16. , 15. ,
3. , 26.33333333, 32.5 , 25.8 ,
18. , 4. , 27.85714286, 28.5 ,
31.18181818, 19.33333333, 34.22222222, 21.8 ,
17.125 , 34. , 33. , 27.33333333,
29.33333333, 39.58333333, 27.83333333, 29.7 ,
21.6 , 22.66666667, 35.55555556, 7. ,
14.33333333, 35.88888889, 27.75 , 24.625 ,
14.25 , 17.8 , 21.33333333, 20.75 ,
20.125 , 23.5 , 24.1 , 28.2 ,
28.66666667, 20.71428571, 34.5 , 26.66666667,
44. , 36.33333333, 15.25 , 30.35714286,
31.54545455, 38. , 34.66666667, 10.33333333,
16.375 , 33.66666667, 30.55555556, 36.88888889,
29.25 , 23.77777778, 36. , 16.6 ,
24.69230769, 28.33333333, 17.4 , 26.25 ,
24.5 , 18.25 , 35.11111111, 38.66666667,
20.44444444, 34.625 , 12.75 , 8.66666667,
10.66666667, 13.6 , 17.33333333, 9.66666667,
23.33333333, 5.81818182, 27.30769231, 11.77777778,
35.8 , 33.375 , 28.42857143, 26.75 ,
10. , 36.57142857, 18.90909091, 28.6 ,
25.5 , 42. , 27.88888889, 46.25 ,
30.25 , 30.11111111, 53. , 28.71428571,
10.5 , 8.16666667, 16.4 , 22.22222222,
43. , 26.1875 , 30.33333333, 37.44444444,
25.55555556, 19.25 , 28.16666667, 41. ,
37.125 , 20.81818182, 17.42857143, 42.14285714,
27.375 , 19.54545455, 19.14285714, 30.75 ,
27.44444444, 20.4 , 10.55555556, 11.33333333,
33.22222222, 18.6 , 23.66666667, 32.66666667,
24.75 , 32.44444444, 20.66666667, 17.5 ,
22.4 , 19.81818182, 23.71428571, 25.33333333,
29.16666667, 18.83333333, 27.7 , 13.25 ,
27.77777778, 31.83333333, 18.2 , 16.875 ,
27.21428571, 16.25 , 29.5 , 25.2 ,
17.61538462, 30.57142857, 12.5 , 16.14285714,
15.57142857, 28.14285714, 9.5 , 46.5 ,
35.14285714, 22.25 , 24.66666667, 18.5 ,
39. , 27.23076923, 26.83333333, 38.33333333,
28.72727273, 10.4 , 21.25 , 29.66666667,
25.64285714, 13.33333333, 26.63636364, 33.85714286,
17.25 , 17.81818182, 25.25 , 27.8 ,
9.14285714, 27.54545455, 14.75 , 29.07692308,
34.33333333, 30.3 , 29.625 , 20.6 ,
26.8 , 32.85714286, 27.16666667, 30.72727273,
11.14285714, 56. , 37.25 , 30.83333333,
32.11111111, 18.66666667, 9.16666667, 20.625 ,
20.57142857, 17.38461538, 26.88888889, 27.66666667,
38.5 , 26.7 , 13.2 , 20.54545455,
24.16666667, 13.375 , 28.4 , 23.81818182,
17.71428571, 35.5 , 30.42857143, 13.66666667,
36.11111111, 26.14285714, 21.83333333, 18.22222222,
61.6 , 22.75 , 31.88888889, 29.6 ,
32.25 , 25.44444444, 55.66666667, 29.28571429,
11.75 , 31.375 , 17.45454545, 8.33333333,
26.875 , 25.81818182, 14.16666667, 30.22222222,
13.85714286, 17.90909091, 21.66666667, 18.45454545,
26.45454545, 46. , 30.66666667, 25.9 ,
31.85714286, 19.91666667, 15.28571429, 2. ,
16.83333333, 23.69230769, 47. , 36.5 ,
17.16666667, 50. , 31.28571429, 15.42857143,
10.83333333, 9.22222222, 25.72727273, 40.8 ,
27.90909091, 35. , 15.46153846, 24.08333333,
29.85714286, 11.66666667, 30.5 , 56.4 ,
7.25 , 24.25 , 19.53333333, 38.71428571,
23.08333333, 26.16666667, 18.57142857, 20.45454545,
27.25 , 13.71428571, 28.75 , 26.81818182,
31.08333333, 16.07142857, 15.2 , 33.5 ,
24.58333333, 19.625 , 10.42857143, 29.14285714,
23.57142857, 13.5 , 6.5 , 12.33333333,
22.14285714, 37.14285714, 21.77777778, 27.07142857,
7.5 , 30.28571429, 27.4 , 25.125 ,
10.8 , 15.1 , 22.6 , 49. ,
45. , 21.90909091, 146. , 26.2 ,
29.91666667, 31.25 , 21.86666667, 25.88888889,
28.83333333, 18.4 , 20.88888889, 18.88888889,
34.125 , 19.27777778, 23.25 , 25.77777778,
29.22222222, 24.375 , 20.16666667, 20.33333333,
32.125 , 20.55555556, 34.8 , 26.42857143,
15.33333333, 15.7 , 40. , 45.6 ,
21.63636364, 25.57142857, 28.22222222, 26.09090909,
12.42857143, 28.77777778, 12.4 , 39.33333333,
14.2 , 15.77777778, 32.9 , 18.8 ,
17.77777778, 54.5 , 29.88888889, 12.69230769,
23.75 , 19.71428571, 30.09090909, 44.5 ,
17.26666667, 37.33333333, 17.6 , 30.2 ,
35.33333333, 28.8 , 27.2 , 20.14285714,
9.25 , 24.57142857, 26.08333333, 29.4 ,
21.44444444, 21.9 , 19.21428571, 18.75 ,
25.75 , 13.93333333, 10.6 , 39.5 ,
42.8 , 31.42857143, 23.28571429, 17.375 ,
37.375 , 6.57692308, 34.14285714, 28.63636364,
16.55555556, 22.42857143, 42.28571429, 23.9375 ,
19.73333333, 37.5 , 13.28571429, 27.14285714,
22.71428571, 19.27272727, 24.88888889, 26.91666667,
28.25 , 23.61538462, 41.6 , 10.93333333,
15.5 , 22.08333333, 19.55555556, 23.4 ,
43.33333333, 57. , 25.42857143, 21.54545455,
24.2 , 22.8 , 15.4 , 15.75 ,
21.28571429, 36.75 , 34.42857143, 54.16666667,
18.125 , 41.5 , 32.75 , 6.66666667,
21.3 , 22.83333333, 8.14285714, 19.2 ,
14.42857143, 36.25 , 19.64705882, 32.4 ,
28.78571429, 24.4 , 35.81818182, 21.36363636,
31.22222222, 11.85714286, 17.4375 , 20.42857143,
27.63636364, 36.72727273, 27.28571429, 16.75 ,
13.16666667, 29.125 , 52. , 32.7 ,
32.33333333, 25.14285714, 12.6 , 22.28571429,
14.57142857, 18.05263158, 16.71428571, 22.2 ,
25.71428571, 26.71428571, 32.72727273, 63. ,
20.7 , 29.84615385, 17.88888889, 31.14285714,
8.5 , 18.33333333, 33.33333333, 28.55555556,
20.83333333, 7.2 , 27.45454545, 31.72727273,
25.27777778, 25.78571429, 11.63636364, 33.88888889,
25.83333333, 25.09090909, 11.25 , 23.875 ,
25.18181818, 29.42857143, 30.85714286, 29.27272727,
29.9 , 14.55555556, 21.71428571, 20.76190476,
27.5625 , 30.625 , 19.6 , 4.66666667,
14.14285714, 13.83333333, 37.4 , 19.44444444,
58. , 20.07142857, 21.85714286, 16.90909091,
20.11111111, 18.06666667, 21.125 , 41.85714286,
21.13333333, 12.25 , 22.3 , 29.09090909,
17.44444444, 24.23076923, 33.42857143, 22.84615385,
24.55555556, 20.85714286, 48.75 , 13.625 ,
19.41666667, 17.2 , 53.2 , 13.42857143,
11.6 , 31.625 , 18.9047619 , 19.16666667,
16.2 , 22.09090909, 24.6 , 23.3 ,
22.76923077, 20.2 , 29.71428571, 25.4 ,
21.88888889, 19.125 , 12.375 , 24.45454545,
7.36363636, 20.92857143, 29.875 , 31.6 ,
37.6 , 21.4 , 18.27272727, 33.11111111,
21.1 , 11.4 , 22.30769231, 31.92307692,
24.91666667, 27.57142857, 32.6 , 22.63636364,
11.45454545, 22.55555556, 17.75 , 22.44444444,
10.22727273, 24.07692308, 27.9 , 32.16666667,
31.2 , 31.66666667, 19.42857143, 24.81818182,
28.125 , 24.77777778, 25.875 , 25.91666667,
15.16666667, 18.875 , 29.375 , 26.41666667,
32.83333333, 19.18181818, 29.36363636, 23.83333333,
37.8 , 20.8 , 20.46666667, 24.83333333,
8.52941176, 15.61538462, 29.2 , 14.83333333,
28.21428571, 12.28571429, 13.55555556, 16.28571429,
17.83333333, 23.18181818, 23.14285714, 16.44444444,
18.28571429, 1. , 12.8 , 28.375 ,
16.42857143, 25.07692308, 23.26666667, 6.625 ,
25.6 , 37.625 , 18.07142857, 26.57142857,
23.11111111, 28.88888889, 25.85714286, 20.875 ,
6.75 , 27.6 , 22.27272727, 17.22222222,
24.92307692, 38.54545455, 23.92857143, 33.63636364,
23.84615385, 22.88888889, 24.85714286, 34.6 ,
22.875 , 31.4 , 31.16666667, 21.57142857,
51. , 15.8 , 28.85714286, 15.85714286,
16.16666667, 15.26666667, 34.16666667, 15.92307692,
75.66666667, 20.91666667, 24.71428571, 17.875 ,
42.6 , 30.08333333, 31.55555556, 23.22222222,
24.8 , 48. , 30.38461538, 16.88888889,
31.76923077, 19.63157895, 30.92857143, 43.2 ,
22.625 , 21.16666667, 34.88888889, 13.72727273,
28.38461538, 14.61904762, 25.375 , 42.5 ,
13.08333333, 16.8 , 48.66666667, 14.77777778,
31.71428571, 19.375 , 17.23076923, 6.33333333,
32.8 , 19.72222222, 19.77777778, 61. ,
22.7 , 5.66666667, 62. , 28.28571429,
13.75 , 34.55555556, 13.8 , 27.53846154,
27.92307692, 30.125 , 31.77777778, 23.76923077,
28.44444444, 29.57142857, 22.375 , 22.125 ,
21.09090909, 33.08333333, 23.16666667, 19.09090909,
32.1 , 23.88888889, 71.5 , 26.22222222,
20.09090909, 35.16666667, 33.55555556, 6.4 ,
30.81818182, 31.125 , 25.16666667, 21.42857143,
33.2 , 17.09090909, 14.28571429, 32.2 ,
17.3 , 31.33333333, 18.625 , 10.91666667,
24.28571429, 21.41666667, 30.14285714, 13.57142857,
14.71428571, 23.42857143, 28.61538462, 151. ,
13.14285714, 24.22222222, 37.75 , 45.57142857]) Start coding or generate with AI.
Start coding or generate with AI.
Start coding or generate with AI.
Start coding or generate with AI.
# ===== COMPREHENSIVE MODEL EVALUATION =====
print("\n🎯 COMPREHENSIVE MODEL PERFORMANCE OVERVIEW")
print("=" * 50)
# Additional metrics
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
# Calculate all key metrics at the default 0.5 decision threshold.
# NOTE(review): precision/recall/F1 here are for the positive label (1) —
# confirm whether 1 means "good" or "default" in good_bad_flag.
accuracy = accuracy_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)
print(f"📈 DETAILED METRICS:")
print(f" Accuracy: {accuracy:.3f}")
print(f" Precision: {precision:.3f}")
print(f" Recall: {recall:.3f}")
print(f" F1-Score: {f1:.3f}")
print(f" ROC-AUC: {test_roc_auc:.3f}")
print(f" PR-AUC: {test_pr_auc:.3f}")
# Confusion Matrix — sklearn layout: rows = actual, columns = predicted
cm = confusion_matrix(y_test, y_test_pred)
print(f"\n🔢 CONFUSION MATRIX:")
print(f" True Negatives: {cm[0,0]}")
print(f" False Positives: {cm[0,1]}")
print(f" False Negatives: {cm[1,0]}")
print(f" True Positives: {cm[1,1]}")
🎯 COMPREHENSIVE MODEL PERFORMANCE OVERVIEW ================================================== 📈 DETAILED METRICS: Accuracy: 0.740 Precision: 0.852 Recall: 0.808 F1-Score: 0.829 ROC-AUC: 0.709 PR-AUC: 0.886 🔢 CONFUSION MATRIX: True Negatives: 71 False Positives: 72 False Negatives: 98 True Positives: 413
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, precision_recall_curve

# Create a comprehensive evaluation plot: ROC, PR, confusion matrix, and
# (when the classifier supports it) the top-10 feature importances.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Model Performance Evaluation', fontsize=16, fontweight='bold')

# 1. ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_test_proba)
axes[0, 0].plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {test_roc_auc:.3f})')
axes[0, 0].plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', alpha=0.8)  # chance line
axes[0, 0].set_xlim([0.0, 1.0])
axes[0, 0].set_ylim([0.0, 1.05])
axes[0, 0].set_xlabel('False Positive Rate')
axes[0, 0].set_ylabel('True Positive Rate')
axes[0, 0].set_title('ROC Curve')
axes[0, 0].legend(loc="lower right")
axes[0, 0].grid(True, alpha=0.3)

# 2. Precision-Recall Curve
precision_vals, recall_vals, _ = precision_recall_curve(y_test, y_test_proba)
axes[0, 1].plot(recall_vals, precision_vals, color='blue', lw=2, label=f'PR curve (AUC = {test_pr_auc:.3f})')
axes[0, 1].set_xlim([0.0, 1.0])
axes[0, 1].set_ylim([0.0, 1.05])
axes[0, 1].set_xlabel('Recall')
axes[0, 1].set_ylabel('Precision')
axes[0, 1].set_title('Precision-Recall Curve')
axes[0, 1].legend(loc="upper right")
axes[0, 1].grid(True, alpha=0.3)

# 3. Confusion Matrix Heatmap
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Predicted 0', 'Predicted 1'],
            yticklabels=['Actual 0', 'Actual 1'], ax=axes[1, 0])
axes[1, 0].set_title('Confusion Matrix')

# 4. Feature Importance (if available)
# BUG FIX: final_model is an imblearn Pipeline, which never exposes
# feature_importances_ itself — the attribute lives on the fitted classifier
# step, so the original hasattr() check was always False and this quadrant
# stayed empty. Likewise raw X_train column names do not match the
# post-preprocessing feature space; derive names from the pipeline instead.
steps = getattr(final_model, 'named_steps', {})
classifier = steps.get('classifier', final_model) if steps else final_model
if hasattr(classifier, 'feature_importances_'):
    importances = classifier.feature_importances_
    try:
        # Names after the ColumnTransformer (one-hot expansion etc.) ...
        feature_names = steps['preprocessor'].get_feature_names_out()
        if 'var_thresh' in steps:
            # ... filtered down to the columns VarianceThreshold kept
            feature_names = feature_names[steps['var_thresh'].get_support()]
    except Exception:
        feature_names = [f'Feature_{i}' for i in range(len(importances))]
    importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': importances
    }).sort_values('importance', ascending=False).head(10)
    axes[1, 1].barh(range(len(importance_df)), importance_df['importance'])
    axes[1, 1].set_yticks(range(len(importance_df)))
    axes[1, 1].set_yticklabels(importance_df['feature'])
    axes[1, 1].set_xlabel('Feature Importance')
    axes[1, 1].set_title('Top 10 Feature Importances')
    axes[1, 1].invert_yaxis()  # largest importance at the top

plt.tight_layout()
plt.show()
Start coding or generate with AI.
import joblib
import json
# Save the trained model in Colab
joblib.dump(final_model, '/content/loan_default_model.pkl')
# Save model info — feature_names lets the serving layer reorder incoming
# JSON payloads into the training-time column layout
model_info['feature_names'] = list(X_train.columns) if hasattr(X_train, 'columns') else None
with open('/content/model_info.json', 'w') as f:
    json.dump(model_info, f, indent=2)
print("✅ Model saved in Colab at /content/loan_default_model.pkl")
print("✅ Model info saved at /content/model_info.json")
# Verify files exist
import os
print(f"Model file exists: {os.path.exists('/content/loan_default_model.pkl')}")
print(f"Model info file exists: {os.path.exists('/content/model_info.json')}")
✅ Model saved in Colab at /content/loan_default_model.pkl ✅ Model info saved at /content/model_info.json Model file exists: True Model info file exists: True
# Install required packages
!pip install flask flask-ngrok pyngrok
# Import ngrok for public URL
from pyngrok import ngrok
import threading
import time
Requirement already satisfied: flask in /usr/local/lib/python3.12/dist-packages (3.1.1) Collecting flask-ngrok Downloading flask_ngrok-0.0.25-py3-none-any.whl.metadata (1.8 kB) Requirement already satisfied: pyngrok in /usr/local/lib/python3.12/dist-packages (7.3.0) Requirement already satisfied: blinker>=1.9.0 in /usr/local/lib/python3.12/dist-packages (from flask) (1.9.0) Requirement already satisfied: click>=8.1.3 in /usr/local/lib/python3.12/dist-packages (from flask) (8.2.1) Requirement already satisfied: itsdangerous>=2.2.0 in /usr/local/lib/python3.12/dist-packages (from flask) (2.2.0) Requirement already satisfied: jinja2>=3.1.2 in /usr/local/lib/python3.12/dist-packages (from flask) (3.1.6) Requirement already satisfied: markupsafe>=2.1.1 in /usr/local/lib/python3.12/dist-packages (from flask) (3.0.2) Requirement already satisfied: werkzeug>=3.1.0 in /usr/local/lib/python3.12/dist-packages (from flask) (3.1.3) Requirement already satisfied: requests in /usr/local/lib/python3.12/dist-packages (from flask-ngrok) (2.32.4) Requirement already satisfied: PyYAML>=5.1 in /usr/local/lib/python3.12/dist-packages (from pyngrok) (6.0.2) Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests->flask-ngrok) (3.4.3) Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.12/dist-packages (from requests->flask-ngrok) (3.10) Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests->flask-ngrok) (2.5.0) Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.12/dist-packages (from requests->flask-ngrok) (2025.8.3) Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB) Installing collected packages: flask-ngrok Successfully installed flask-ngrok-0.0.25
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 9.9/9.9 MB 59.6 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 524.0/524.0 kB 29.2 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.9/6.9 MB 71.1 MB/s eta 0:00:00
Start coding or generate with AI.
from flask import Flask, request, jsonify
import joblib
import pandas as pd
import numpy as np
import json
import os

app = Flask(__name__)

# Load model with correct Colab path; endpoints check `model is None` as the
# "not ready" sentinel instead of crashing at import time.
try:
    model = joblib.load('/content/loan_default_model.pkl')
    with open('/content/model_info.json', 'r') as f:
        model_info = json.load(f)
    print("✅ Model loaded successfully in Colab!")
except FileNotFoundError as e:
    print(f"❌ Model file not found: {e}")
    model = None
    model_info = None
@app.route('/')
def home():
    """Landing page: show model metadata and list the available endpoints."""
    if model is None:
        return '<h1>❌ Model Not Found</h1><p>Please save your model first!</p>'
    # BUG FIX: the original formatted model_info.get('test_roc_auc', 'N/A')
    # with :.3f — if the key is absent, the 'N/A' string hits the float
    # format spec and raises ValueError. Format only when the value is numeric.
    roc_auc = model_info.get('test_roc_auc')
    roc_auc_text = f"{roc_auc:.3f}" if isinstance(roc_auc, (int, float)) else 'N/A'
    return f'''
    <h1>🎯 Loan Default Predictor API</h1>
    <p>✅ Model loaded successfully in Google Colab!</p>
    <p><strong>Model Type:</strong> {model_info.get('model_type', 'Unknown')}</p>
    <p><strong>Test ROC-AUC:</strong> {roc_auc_text}</p>
    <p><strong>Features:</strong> {model_info.get('features_shape', 'Unknown')}</p>
    <h3>📡 Available Endpoints:</h3>
    <ul>
        <li><strong>GET /</strong> - This page</li>
        <li><strong>GET /health</strong> - Health check</li>
        <li><strong>POST /predict</strong> - Make predictions</li>
    </ul>
    <h3>🧪 Test Prediction:</h3>
    <p>Send POST request to /predict with your features as JSON</p>
    '''
@app.route('/health')
def health():
    """Health probe: 500 until the model pickle has been loaded, else a
    small JSON summary of the deployed model's metadata."""
    if model is None:
        return jsonify({'status': 'error', 'message': 'Model not loaded'}), 500
    status_payload = {
        'status': 'healthy',
        'platform': 'Google Colab',
        'model_type': model_info.get('model_type'),
        'test_roc_auc': model_info.get('test_roc_auc'),
        'features_count': model_info.get('features_shape'),
    }
    return jsonify(status_payload)
@app.route('/predict', methods=['POST'])
def predict():
    """Score one loan application posted as a flat JSON object of features.

    Returns the hard prediction, the default probability, and a banded
    risk verdict; 400 on bad input, 500 when the model is not loaded.
    """
    if model is None:
        return jsonify({'error': 'Model not loaded'}), 500
    try:
        payload = request.get_json()
        if not payload:
            return jsonify({'error': 'No data provided'}), 400

        # Single-row frame, reordered to the training-time feature layout;
        # any feature missing from the payload is filled with 0.
        features = pd.DataFrame([payload])
        expected_columns = model_info.get('feature_names')
        if expected_columns:
            features = features.reindex(columns=expected_columns, fill_value=0)

        label = model.predict(features)[0]
        default_proba = model.predict_proba(features)[0][1]

        # Map probability bands to a human-readable risk verdict.
        if default_proba > 0.7:
            risk_level, recommendation = 'High Risk', 'Loan application should be rejected'
        elif default_proba > 0.4:
            risk_level, recommendation = 'Medium Risk', 'Loan application needs additional review'
        else:
            risk_level, recommendation = 'Low Risk', 'Loan application can be approved'

        return jsonify({
            'prediction': int(label),
            'probability': float(default_proba),
            'probability_percent': f"{default_proba * 100:.1f}%",
            'risk_level': risk_level,
            'recommendation': recommendation,
            'timestamp': str(pd.Timestamp.now())
        })
    except Exception as e:
        return jsonify({'error': f'Prediction failed: {str(e)}'}), 400
# Function to run Flask in background
def run_flask():
    # Bind on all interfaces so the ngrok tunnel can reach it; the reloader
    # is disabled because this runs inside a Colab background thread.
    app.run(host='0.0.0.0', port=5000, debug=False, use_reloader=False)
print("Flask app created successfully!")
✅ Model loaded successfully in Colab! Flask app created successfully!
# === STEP 1: Create Streamlit app that uses your existing data ===
# NOTE(review): %%writefile is a cell magic and normally must be the FIRST
# line of the cell — confirm this comment's placement doesn't break it.
%%writefile view_my_model.py
import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
import pickle
import joblib
# Set page config
# NOTE(review): "Defalt" is a typo for "Default" in the user-visible title.
st.set_page_config(
    page_title="Loan Defalt Risk Predictor App",
    page_icon="🏦",
    layout="wide"
)
st.title("🏦 My Loan Model Dashboard")
st.markdown("### Viewing Your Trained Model & Data")
# Load your data and model (you'll need to modify these paths)
@st.cache_data
def load_your_data():
    """Return the dashboard DataFrame.

    Uses a module-global ``df`` when one exists; otherwise builds a
    reproducible synthetic demo dataset with the expected loan columns.
    """
    # BUG FIX: the original wrapped globals().get('df', None) in try/except,
    # but dict.get never raises — so the demo-data fallback below was dead
    # code and the function silently returned None when 'df' was absent.
    existing = globals().get('df', None)
    if existing is not None:
        return existing
    # Option 2: Load from file
    # return pd.read_csv('your_data.csv')
    # Option 3: Create sample data for demo (seeded, so runs are repeatable)
    np.random.seed(42)
    n_samples = 1000
    data = {
        'loanamount': np.random.uniform(1000, 50000, n_samples),
        'termdays': np.random.choice([30, 60, 90, 120, 180, 365], n_samples),
        'good_bad_flag': np.random.choice([0, 1], n_samples, p=[0.7, 0.3]),
        'bank_account_type': np.random.choice(['Savings', 'Current', 'Fixed'], n_samples),
        'employment_status_clients': np.random.choice(['Permanent', 'Temporary', 'Unemployed', 'Self-employed'], n_samples),
        'birth_year': np.random.randint(1960, 2000, n_samples),
        'num_prev_loans': np.random.poisson(2, n_samples),
        'avg_repay_delay_days': np.random.exponential(5, n_samples),
        'age': 2024 - np.random.randint(1960, 2000, n_samples),
        'interest_curr_rate': np.random.uniform(0.05, 0.25, n_samples),
        'repayment_curr_ratio': np.random.uniform(0.8, 1.2, n_samples),
        'repayment_efficiency': np.random.uniform(0.6, 1.0, n_samples),
        'late_payment_rate': np.random.exponential(0.1, n_samples),
    }
    return pd.DataFrame(data)
@st.cache_resource
def load_your_model():
    """Return the trained model: a module-global ``model`` when present,
    else None (the dashboard degrades gracefully without one)."""
    # MODIFY THIS TO LOAD YOUR ACTUAL MODEL
    # CLEANUP: globals().get never raises, so the original try/except and
    # the file-loading fallback in its except branch were dead code.
    # Option 2: Load from file
    # return joblib.load('your_model.pkl')
    # return pickle.load(open('your_model.pkl', 'rb'))
    return globals().get('model', None)
# Load data and model
df = load_your_data()
model = load_your_model()
if df is not None:
    # Basic info — headline metrics across the top of the page
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("Total Records", f"{len(df):,}")
    with col2:
        st.metric("Features", f"{df.shape[1]}")
    with col3:
        if 'good_bad_flag' in df.columns:
            default_rate = df['good_bad_flag'].mean() * 100
            st.metric("Default Rate", f"{default_rate:.1f}%")
        else:
            st.metric("Default Rate", "N/A")
    with col4:
        if model is not None:
            st.metric("Model Status", "✅ Loaded")
        else:
            st.metric("Model Status", "❌ Not Found")
    # Display data sample
    st.subheader("📊 Your Data Sample")
    st.dataframe(df.head(10), use_container_width=True)
    # Data info — dtypes/null counts beside the numeric summary
    st.subheader("📈 Data Info")
    col1, col2 = st.columns(2)
    with col1:
        st.write("**Column Types:**")
        dtype_info = pd.DataFrame({
            'Column': df.columns,
            'Type': df.dtypes,
            'Non-Null': df.count(),
            'Null Count': df.isnull().sum()
        })
        st.dataframe(dtype_info, use_container_width=True)
    with col2:
        st.write("**Numeric Summary:**")
        if len(df.select_dtypes(include=[np.number]).columns) > 0:
            st.dataframe(df.describe(), use_container_width=True)
    # Feature distributions — user picks one numeric column to explore
    st.subheader("📊 Feature Analysis")
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
    if numeric_cols:
        selected_feature = st.selectbox("Select feature to analyze:", numeric_cols)
        col1, col2 = st.columns(2)
        with col1:
            # Distribution
            fig = px.histogram(df, x=selected_feature, title=f"Distribution of {selected_feature}")
            st.plotly_chart(fig, use_container_width=True)
        with col2:
            # Box plot by target if available
            if 'good_bad_flag' in df.columns:
                fig = px.box(df, x='good_bad_flag', y=selected_feature,
                             title=f"{selected_feature} by Risk")
                st.plotly_chart(fig, use_container_width=True)
            else:
                # Just show basic stats
                stats = df[selected_feature].describe()
                st.write(f"**{selected_feature} Statistics:**")
                for stat, value in stats.items():
                    st.write(f"- {stat.title()}: {value:.2f}")
    # Correlation analysis
    if len(numeric_cols) > 1:
        st.subheader("🔗 Feature Correlations")
        corr_matrix = df[numeric_cols].corr()
        fig = px.imshow(corr_matrix,
                        title="Correlation Heatmap",
                        color_continuous_scale='RdBu')
        st.plotly_chart(fig, use_container_width=True)
    # Model predictions (if model is available)
    # NOTE(review): this feeds only numeric columns to the model, but the
    # saved model is a Pipeline whose preprocessor also expects the original
    # categorical columns — such a mismatch lands in the except branch below.
    if model is not None:
        st.subheader("🤖 Model Predictions")
        try:
            # Try to make predictions
            X = df.select_dtypes(include=[np.number])
            if hasattr(model, 'predict'):
                predictions = model.predict(X)
                prediction_proba = None
                if hasattr(model, 'predict_proba'):
                    prediction_proba = model.predict_proba(X)
                # Add predictions to display
                display_df = df.copy()
                display_df['Prediction'] = predictions
                if prediction_proba is not None:
                    display_df['Probability'] = prediction_proba[:, 1] if prediction_proba.shape[1] > 1 else prediction_proba.flatten()
                st.write("**Sample Predictions:**")
                st.dataframe(display_df[['Prediction'] + (['Probability'] if 'Probability' in display_df.columns else [])].head(10))
                # Prediction distribution
                pred_counts = pd.Series(predictions).value_counts()
                fig = px.bar(x=pred_counts.index, y=pred_counts.values,
                             title="Prediction Distribution")
                st.plotly_chart(fig, use_container_width=True)
            else:
                st.warning("Model doesn't have a predict method")
        except Exception as e:
            st.error(f"Error making predictions: {e}")
            st.info("This might be due to feature mismatch or model format")
    # Model performance (if target variable exists)
    if 'good_bad_flag' in df.columns and model is not None:
        st.subheader("📈 Model Performance")
        try:
            from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
            X = df.select_dtypes(include=[np.number])
            y_true = df['good_bad_flag']
            y_pred = model.predict(X)
            # Calculate metrics (weighted averages across both classes)
            accuracy = accuracy_score(y_true, y_pred)
            precision = precision_score(y_true, y_pred, average='weighted')
            recall = recall_score(y_true, y_pred, average='weighted')
            f1 = f1_score(y_true, y_pred, average='weighted')
            # Display metrics
            col1, col2, col3, col4 = st.columns(4)
            with col1:
                st.metric("Accuracy", f"{accuracy:.3f}")
            with col2:
                st.metric("Precision", f"{precision:.3f}")
            with col3:
                st.metric("Recall", f"{recall:.3f}")
            with col4:
                st.metric("F1-Score", f"{f1:.3f}")
            # Confusion matrix
            cm = confusion_matrix(y_true, y_pred)
            fig = px.imshow(cm, text_auto=True, aspect="auto",
                            title="Confusion Matrix")
            st.plotly_chart(fig, use_container_width=True)
        except Exception as e:
            st.error(f"Error calculating performance metrics: {e}")
else:
    st.error("❌ Could not load your data. Please check the data loading section in the code.")
# Instructions for user — rendered in the sidebar on every run
st.sidebar.header("📝 Instructions")
st.sidebar.markdown("""
**To use with your actual data:**
1. **Modify the `load_your_data()` function** to point to your DataFrame variable or file
2. **Modify the `load_your_model()` function** to load your trained model
3. **Common variable names to try:**
- `df`, `data`, `loan_data`
- `model`, `clf`, `classifier`
4. **File loading examples:**
```python
# For CSV
return pd.read_csv('your_file.csv')
# For model
return joblib.load('model.pkl')
return pickle.load(open('model.pkl', 'rb'))
```
""")
# === STEP 2: Quick setup to run with your existing variables ===
# First, let's try to detect what variables you have available
print("🔍 Detecting your variables...")
print("Available DataFrames:")
for var_name in dir():
var_obj = eval(var_name)
if isinstance(var_obj, pd.DataFrame):
print(f" 📊 {var_name}: {var_obj.shape} - {list(var_obj.columns[:5])}...")
print("\nAvailable potential models:")
for var_name in dir():
var_obj = eval(var_name)
if hasattr(var_obj, 'predict') and hasattr(var_obj, 'fit'):
print(f" 🤖 {var_name}: {type(var_obj)}")
# === STEP 3: Simple inline viewer (works immediately) ===
print("\n" + "="*50)
print("📊 QUICK DATA OVERVIEW")
print("="*50)
# Try to find your data automatically
data_vars = []
model_vars = []
for var_name in dir():
if not var_name.startswith('_'):
try:
var_obj = eval(var_name)
if isinstance(var_obj, pd.DataFrame) and len(var_obj) > 0:
data_vars.append((var_name, var_obj))
elif hasattr(var_obj, 'predict') and hasattr(var_obj, 'fit'):
model_vars.append((var_name, var_obj))
except:
pass
if data_vars:
var_name, df_found = data_vars[0] # Use first DataFrame found
print(f"\n✅ Found DataFrame: {var_name}")
print(f"Shape: {df_found.shape}")
print(f"Columns: {list(df_found.columns)}")
print(f"\nFirst 5 rows:")
print(df_found.head())
if 'good_bad_flag' in df_found.columns:
default_rate = df_found['good_bad_flag'].mean() * 100
print(f"\n📊 Default Rate: {default_rate:.1f}%")
if model_vars:
model_name, model_found = model_vars[0]
print(f"\n✅ Found Model: {model_name}")
print(f"Type: {type(model_found)}")
print(f"\n🚀 To launch full dashboard, run the Streamlit app created above!")
Writing view_my_model.py
# === STEP 2: Detect your existing variables ===
# Scans the notebook namespace for DataFrames and fitted-model-like objects.
print("🔍 Detecting your variables...")
print("Available DataFrames:")
for var_name in dir():
    if not var_name.startswith('_'):  # skip dunders/privates
        try:
            var_obj = globals()[var_name]
            if isinstance(var_obj, pd.DataFrame):
                print(f" 📊 {var_name}: {var_obj.shape} - {list(var_obj.columns[:5])}...")
        except:  # NOTE(review): bare except silently skips lookup failures
            pass
print("\nAvailable potential models:")
for var_name in dir():
    if not var_name.startswith('_'):
        try:
            var_obj = globals()[var_name]
            # Duck-typed model check: anything with both fit() and predict()
            if hasattr(var_obj, 'predict') and hasattr(var_obj, 'fit'):
                print(f" 🤖 {var_name}: {type(var_obj)}")
        except:
            pass
# Show quick preview of your data — first DataFrame with more than 10 rows
data_found = False
for var_name in dir():
    if not var_name.startswith('_'):
        try:
            var_obj = globals()[var_name]
            if isinstance(var_obj, pd.DataFrame) and len(var_obj) > 10:
                print(f"\n✅ Found your DataFrame: '{var_name}'")
                print(f"Shape: {var_obj.shape}")
                print(f"Columns: {list(var_obj.columns)}")
                print(f"\nSample data:")
                print(var_obj.head(3))
                if 'good_bad_flag' in var_obj.columns:
                    default_rate = var_obj['good_bad_flag'].mean() * 100
                    print(f"\n📊 Default Rate: {default_rate:.1f}%")
                data_found = True
                break  # stop at the first qualifying DataFrame
        except:
            pass
if not data_found:
    print("\n❌ No DataFrame found. Make sure your data is loaded in a pandas DataFrame.")
🔍 Detecting your variables... Available DataFrames: 📊 X: (3269, 18) - ['approved_time', 'creation_time', 'bank_account_type', 'employment_status_clients', 'birth_year']... 📊 X_test: (654, 18) - ['approved_time', 'creation_time', 'bank_account_type', 'employment_status_clients', 'birth_year']... 📊 X_train: (2615, 18) - ['approved_time', 'creation_time', 'bank_account_type', 'employment_status_clients', 'birth_year']... 📊 X_train_processed_df: (2615, 5153) - ['birth_year', 'num_prev_loans', 'avg_repay_delay_days', 'total_firstrepaid_late', 'avg_prev_repayment_ratio']... 📊 df_demo: (4334, 8) - ['customerid', 'birthdate', 'bank_account_type', 'longitude_gps', 'latitude_gps']... 📊 df_final: (3269, 24) - ['loanamount', 'termdays', 'good_bad_flag', 'approved_time', 'creation_time']... 📊 df_main: (3269, 18) - ['customerid', 'systemloanid', 'loannumber', 'approveddate', 'creationdate']... 📊 df_perf: (4368, 11) - ['customerid', 'systemloanid', 'loannumber', 'approveddate', 'creationdate']... 📊 df_prevloans: (18183, 17) - ['customerid', 'systemloanid', 'loannumber', 'approveddate', 'creationdate']... 📊 df_prevloans_agg: (4359, 9) - ['customerid', 'num_prev_loans', 'avg_prev_loanamt', 'avg_repay_delay_days', 'total_firstrepaid_late']... 📊 df_results: (1, 6) - ['Train Accuracy', 'Test Accuracy', 'Precision Score', 'Recall Score', 'F1 Score']... 📊 heatmap: (6, 6) - ['sqrt_late_payment_rate', 'sqrt_termdays', 'sqrt_loanamount', 'sqrt_avg_prev_interest', 'sqrt_repayment_efficiency']... 📊 metrics_df: (6, 6) - ['Train Accuracy', 'Test Accuracy', 'Precision Score', 'Recall Score', 'F1 Score']... 📊 missing_ids: (5, 18) - ['customerid', 'systemloanid', 'loannumber', 'approveddate', 'creationdate']... 
Available potential models: 🤖 CatBoostClassifier: <class 'type'> 🤖 DecisionTreeClassifier: <class 'abc.ABCMeta'> 🤖 GradientBoostingClassifier: <class 'abc.ABCMeta'> 🤖 GridSearchCV: <class 'abc.ABCMeta'> 🤖 LogisticRegression: <class 'type'> 🤖 Pipeline: <class 'abc.ABCMeta'> 🤖 RandomForestClassifier: <class 'abc.ABCMeta'> 🤖 RandomizedSearchCV: <class 'abc.ABCMeta'> 🤖 XGBClassifier: <class 'type'> 🤖 best_gb_model: <class 'imblearn.pipeline.Pipeline'> 🤖 final_model: <class 'imblearn.pipeline.Pipeline'> 🤖 gb_pipeline: <class 'imblearn.pipeline.Pipeline'> 🤖 gb_random: <class 'sklearn.model_selection._search.RandomizedSearchCV'> 🤖 model: <class 'imblearn.pipeline.Pipeline'> 🤖 pipeline: <class 'imblearn.pipeline.Pipeline'> ✅ Found your DataFrame: 'X' Shape: (3269, 18) Columns: ['approved_time', 'creation_time', 'bank_account_type', 'employment_status_clients', 'birth_year', 'num_prev_loans', 'avg_repay_delay_days', 'total_firstrepaid_late', 'avg_prev_repayment_ratio', 'avg_duration_days', 'age', 'interest_curr_rate', 'repayment_curr_ratio', 'sqrt_late_payment_rate', 'sqrt_termdays', 'sqrt_loanamount', 'sqrt_avg_prev_interest', 'sqrt_repayment_efficiency'] Sample data: approved_time creation_time bank_account_type employment_status_clients \ 0 08:22:56 07:22:47 Other Permanent 1 17:04:41 16:04:18 Savings Permanent 2 14:52:57 13:52:51 Other Permanent birth_year num_prev_loans avg_repay_delay_days total_firstrepaid_late \ 0 1972 11.00 -0.91 3.00 1 1985 0.00 0.00 0.00 2 1984 6.00 0.83 1.00 avg_prev_repayment_ratio avg_duration_days age interest_curr_rate \ 0 1.23 29.45 45 0.15 1 0.00 0.00 31 0.15 2 1.18 18.17 32 0.11 repayment_curr_ratio sqrt_late_payment_rate sqrt_termdays \ 0 1.15 0.52 5.48 1 1.15 0.00 5.48 2 1.11 0.41 3.87 sqrt_loanamount sqrt_avg_prev_interest sqrt_repayment_efficiency 0 173.21 62.45 0.97 1 122.47 0.00 1072.38 2 141.42 41.83 0.97
# === Create the Loan Default Risk Predictor App ===
# The complete Streamlit dashboard source is held in this one triple-quoted
# string; a later cell writes it verbatim to 'loan_default_risk_predictor.py'
# and launches it with `streamlit run`.
# NOTE(review): the embedded app imports df_final, final_model, X, X_test and
# y_test from __main__ via exec(), so it only works when Streamlit runs in the
# same process/environment that trained the model — TODO confirm this holds
# under `streamlit run`, which starts a fresh interpreter.
dashboard_code = '''import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
# Set page config
st.set_page_config(
page_title="Loan Default Risk Predictor",
page_icon="⚠️",
layout="wide"
)
# Header with branding
st.markdown("""
<div style='text-align: center; padding: 20px; background: linear-gradient(90deg, #FF6B6B, #4ECDC4); border-radius: 10px; margin-bottom: 20px;'>
<h1 style='color: white; margin: 0;'>⚠️ Loan Default Risk Predictor App</h1>
<p style='color: white; margin: 5px 0 0 0; font-size: 18px;'>Advanced ML-Powered Credit Risk Assessment System</p>
</div>
""", unsafe_allow_html=True)
# Load your actual variables
try:
# Access your variables from the global environment
exec("from __main__ import df_final, final_model, X, X_test, y_test", globals())
df = df_final
model = final_model
data_loaded = True
st.sidebar.success("✅ Data & Model Loaded Successfully")
except Exception as e:
st.error(f"⚠️ Could not load your data variables: {e}")
data_loaded = False
st.sidebar.error("❌ Data Loading Failed")
if data_loaded:
# Key Performance Indicators Dashboard
st.markdown("### 📊 Risk Assessment Dashboard")
col1, col2, col3, col4, col5 = st.columns(5)
with col1:
st.metric("📋 Total Loans", f"{len(df):,}")
with col2:
st.metric("🔢 Features", f"{df.shape[1]}")
with col3:
if 'good_bad_flag' in df.columns:
default_rate = df['good_bad_flag'].mean() * 100
st.metric("⚠️ Default Rate", f"{default_rate:.1f}%",
delta=f"{default_rate-20:.1f}%" if default_rate > 20 else f"+{20-default_rate:.1f}%")
else:
st.metric("⚠️ Default Rate", "N/A")
with col4:
if 'loanamount' in df.columns:
avg_loan = df['loanamount'].mean()
st.metric("💰 Avg Loan", f"${avg_loan:,.0f}")
else:
st.metric("💰 Avg Loan", "N/A")
with col5:
st.metric("🤖 ML Model", "Active", delta="Trained")
# Main Navigation Tabs
tab1, tab2, tab3, tab4, tab5 = st.tabs([
"🏠 Overview",
"📈 Risk Analytics",
"🎯 Predictive Insights",
"🤖 Model Performance",
"⚡ Live Predictions"
])
with tab1:
st.markdown("## 🏠 Loan Portfolio Overview")
col1, col2 = st.columns([2, 1])
with col1:
st.subheader("📋 Recent Loan Applications")
display_df = df.head(15).copy()
# Add risk labels for better understanding
if 'good_bad_flag' in display_df.columns:
display_df['Risk_Status'] = display_df['good_bad_flag'].map({0: '✅ Low Risk', 1: '⚠️ High Risk'})
st.dataframe(display_df, use_container_width=True)
with col2:
st.subheader("📊 Portfolio Health")
if 'good_bad_flag' in df.columns:
# Risk distribution pie chart
risk_counts = df['good_bad_flag'].value_counts()
fig = px.pie(
values=risk_counts.values,
names=['✅ Good Loans', '⚠️ Risky Loans'],
title="Loan Risk Distribution",
color_discrete_sequence=['#2E8B57', '#FF6347']
)
st.plotly_chart(fig, use_container_width=True)
# Risk metrics
total_loans = len(df)
risky_loans = df['good_bad_flag'].sum()
safe_loans = total_loans - risky_loans
st.metric("🟢 Safe Loans", f"{safe_loans:,}", f"{(safe_loans/total_loans)*100:.1f}%")
st.metric("🔴 Risky Loans", f"{risky_loans:,}", f"{(risky_loans/total_loans)*100:.1f}%")
# Data Quality Assessment
st.subheader("🔍 Data Quality Report")
missing_data = df.isnull().sum()
if missing_data.sum() > 0:
st.warning("⚠️ Missing data detected in some fields")
missing_df = missing_data[missing_data > 0].sort_values(ascending=False)
fig = px.bar(x=missing_df.values, y=missing_df.index, orientation='h',
title="Missing Values by Feature", color_discrete_sequence=['#FFA500'])
st.plotly_chart(fig, use_container_width=True)
else:
st.success("✅ Data quality excellent - no missing values detected!")
with tab2:
st.markdown("## 📈 Risk Analytics Dashboard")
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if numeric_cols:
col1, col2 = st.columns([1, 3])
with col1:
st.subheader("🎛️ Analytics Controls")
selected_feature = st.selectbox("📊 Select Risk Factor:", numeric_cols)
# Feature statistics
if selected_feature in df.columns:
feature_stats = df[selected_feature].describe()
st.markdown("**📈 Quick Stats:**")
st.write(f"**Mean:** {feature_stats['mean']:.2f}")
st.write(f"**Std:** {feature_stats['std']:.2f}")
st.write(f"**Min:** {feature_stats['min']:.2f}")
st.write(f"**Max:** {feature_stats['max']:.2f}")
with col2:
# Feature distribution
fig_dist = px.histogram(df, x=selected_feature, nbins=30,
title=f"📊 Distribution of {selected_feature}",
color_discrete_sequence=['#4ECDC4'])
st.plotly_chart(fig_dist, use_container_width=True)
# Risk comparison
if 'good_bad_flag' in df.columns:
fig_box = px.box(df, x='good_bad_flag', y=selected_feature,
title=f"🎯 {selected_feature} by Risk Category",
color='good_bad_flag',
color_discrete_sequence=['#2E8B57', '#FF6347'])
fig_box.update_xaxes(tickvals=[0, 1], ticktext=['✅ Good Risk', '⚠️ Bad Risk'])
st.plotly_chart(fig_box, use_container_width=True)
# Risk Factor Correlation Analysis
if 'good_bad_flag' in df.columns and len(numeric_cols) > 1:
st.subheader("🎯 Risk Factor Impact Analysis")
correlations = df[numeric_cols].corrwith(df['good_bad_flag']).abs().sort_values(ascending=True)
fig = px.bar(x=correlations.values, y=correlations.index,
orientation='h', title="🔥 Feature Impact on Default Risk",
color=correlations.values, color_continuous_scale='Reds')
st.plotly_chart(fig, use_container_width=True)
with tab3:
st.markdown("## 🎯 Predictive Risk Insights")
if 'good_bad_flag' in df.columns:
categorical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()
categorical_features = [col for col in categorical_cols if df[col].nunique() < 20]
if categorical_features:
st.subheader("🔍 Risk Analysis by Categories")
selected_cat = st.selectbox("📋 Analyze risk by:", categorical_features)
# Risk analysis
risk_data = df.groupby(selected_cat)['good_bad_flag'].agg(['count', 'mean']).reset_index()
risk_data.columns = [selected_cat, 'Total_Loans', 'Default_Rate']
risk_data['Default_Rate_Pct'] = risk_data['Default_Rate'] * 100
risk_data = risk_data.sort_values('Default_Rate_Pct', ascending=False)
# Enhanced visualization
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(
go.Bar(x=risk_data[selected_cat], y=risk_data['Total_Loans'],
name="📊 Number of Loans", marker_color='lightblue',
opacity=0.7),
secondary_y=False,
)
fig.add_trace(
go.Scatter(x=risk_data[selected_cat], y=risk_data['Default_Rate_Pct'],
mode='lines+markers+text', name="⚠️ Default Rate (%)",
line=dict(color='red', width=4),
marker=dict(size=10),
textposition="top center"),
secondary_y=True,
)
fig.update_xaxes(title_text=f"📋 {selected_cat}")
fig.update_yaxes(title_text="📊 Number of Loans", secondary_y=False)
fig.update_yaxes(title_text="⚠️ Default Rate (%)", secondary_y=True)
fig.update_layout(title=f"🎯 Risk Analysis by {selected_cat}", height=500)
st.plotly_chart(fig, use_container_width=True)
# Risk ranking table
st.subheader("📊 Risk Ranking by Category")
risk_data_display = risk_data.copy()
risk_data_display['Risk_Level'] = pd.cut(risk_data_display['Default_Rate_Pct'],
bins=[0, 15, 30, 100],
labels=['🟢 Low', '🟡 Medium', '🔴 High'])
st.dataframe(risk_data_display, use_container_width=True)
with tab4:
st.markdown("## 🤖 ML Model Performance Dashboard")
try:
# Model predictions
y_pred = model.predict(X_test)
# Performance metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
# Try to get probability predictions for AUC
try:
y_proba = model.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, y_proba)
except:
auc_score = None
# Performance metrics display
st.subheader("📈 Model Performance Metrics")
col1, col2, col3, col4, col5 = st.columns(5)
with col1:
st.metric("🎯 Accuracy", f"{accuracy:.1%}",
delta=f"+{(accuracy-0.8)*100:.1f}%" if accuracy > 0.8 else f"{(accuracy-0.8)*100:.1f}%")
with col2:
st.metric("🎯 Precision", f"{precision:.1%}")
with col3:
st.metric("🎯 Recall", f"{recall:.1%}")
with col4:
st.metric("🎯 F1-Score", f"{f1:.1%}")
with col5:
if auc_score:
st.metric("🎯 AUC Score", f"{auc_score:.1%}")
else:
st.metric("🎯 AUC Score", "N/A")
col1, col2 = st.columns(2)
with col1:
# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
fig = px.imshow(cm, text_auto=True, aspect="auto",
title="🎯 Confusion Matrix",
labels=dict(x="Predicted", y="Actual"),
color_continuous_scale='Blues')
fig.update_xaxes(tickvals=[0, 1], ticktext=['Good', 'Bad'])
fig.update_yaxes(tickvals=[0, 1], ticktext=['Good', 'Bad'])
st.plotly_chart(fig, use_container_width=True)
with col2:
# Prediction distribution
pred_counts = pd.Series(y_pred).value_counts()
fig = px.pie(values=pred_counts.values,
names=['✅ Predicted Good', '⚠️ Predicted Bad'],
title="🔮 Model Predictions Distribution",
color_discrete_sequence=['#2E8B57', '#FF6347'])
st.plotly_chart(fig, use_container_width=True)
# Model insights
st.subheader("🔍 Model Insights")
total_test = len(y_test)
correct_predictions = (y_pred == y_test).sum()
st.info(f"""
**🤖 Model Summary:**
- Tested on {total_test:,} loan applications
- Correctly identified {correct_predictions:,} cases ({accuracy:.1%})
- Model Type: {type(model).__name__}
""")
except Exception as e:
st.error(f"⚠️ Error in model evaluation: {e}")
st.info("💡 Make sure your test data (X_test, y_test) is available")
with tab5:
st.markdown("## ⚡ Live Risk Prediction")
st.info("🚀 **Coming Soon**: Interactive loan risk calculator where you can input loan parameters and get instant risk predictions!")
# Placeholder for live prediction interface
st.subheader("🎛️ Loan Risk Calculator")
col1, col2 = st.columns(2)
with col1:
loan_amount = st.number_input("💰 Loan Amount ($)", min_value=1000, max_value=100000, value=25000)
term_days = st.selectbox("📅 Loan Term (Days)", [30, 60, 90, 120, 180, 365])
employment_status = st.selectbox("💼 Employment Status", ['Permanent', 'Temporary', 'Self-employed', 'Unemployed'])
with col2:
age = st.slider("👤 Applicant Age", 18, 80, 35)
account_type = st.selectbox("🏦 Account Type", ['Savings', 'Current', 'Other'])
prev_loans = st.number_input("📋 Previous Loans", min_value=0, max_value=20, value=2)
if st.button("🔮 Predict Default Risk", type="primary"):
st.warning("⚡ Live prediction feature will be implemented with your specific model requirements!")
st.balloons()
else:
st.error("❌ Could not load your data. Make sure df_final and final_model are available.")
st.markdown("""
### 🔧 Troubleshooting:
1. Ensure your variables `df_final` and `final_model` are loaded
2. Run this dashboard in the same environment as your model training
3. Check that all required libraries are installed
""")
# Sidebar with app info
st.sidebar.markdown("""
---
### ⚠️ Loan Default Risk Predictor
**🎯 Features:**
- Advanced ML risk assessment
- Real-time portfolio monitoring
- Interactive risk analytics
- Performance dashboards
**📊 Your Data:**
- Dataset: Loan applications
- Model: Gradient Boosting Pipeline
- Features: 24+ risk factors
""")
st.sidebar.markdown("---")
st.sidebar.markdown("🏦 **Built with:** Streamlit • Plotly • Scikit-learn")
# Footer
st.markdown("---")
st.markdown("""
<div style='text-align: center; color: #666; padding: 20px;'>
⚠️ <strong>Loan Default Risk Predictor App</strong> | Powered by Machine Learning<br>
<small>Advanced Credit Risk Assessment • Real-time Analytics • Predictive Insights</small>
</div>
""", unsafe_allow_html=True)
'''
# Persist the generated Streamlit app so `streamlit run` can load it.
# Explicit UTF-8 matters here: the dashboard source is full of emoji, and a
# platform default encoding (e.g. cp1252 on Windows) would raise on write.
with open('loan_default_risk_predictor.py', 'w', encoding='utf-8') as f:
    f.write(dashboard_code)
print("✅ Loan Default Risk Predictor App created successfully!")
print("📱 App Name: Loan Default Risk Predictor")
print("🎯 Features: Advanced ML-powered credit risk assessment")
# Now launch it in the background
import subprocess
import threading
import time
def run_streamlit():
    """Serve the generated dashboard on port 8501, bound to all interfaces."""
    command = [
        "streamlit", "run", "loan_default_risk_predictor.py",
        "--server.port", "8501",
        "--server.address", "0.0.0.0",
    ]
    subprocess.run(command)
# Kill any existing Streamlit processes so port 8501 is free.
# FIX: `!pkill -f streamlit` is IPython-only cell magic and is a syntax error
# in a plain Python file; subprocess does the same thing portably.
subprocess.run(["pkill", "-f", "streamlit"])
# Start Streamlit in a background daemon thread so this cell returns and the
# server dies with the notebook kernel instead of lingering.
thread = threading.Thread(target=run_streamlit)
thread.daemon = True
thread.start()
print("🚀 Starting your Loan Default Risk Predictor App...")
time.sleep(5)  # give the server a moment to boot before reporting success
print("✅ App should be running on port 8501")
print("💡 Use Colab's port forwarding to access your professional risk assessment app!")
✅ Loan Default Risk Predictor App created successfully! 📱 App Name: Loan Default Risk Predictor 🎯 Features: Advanced ML-powered credit risk assessment 🚀 Starting your Loan Default Risk Predictor App... ✅ App should be running on port 8501 💡 Use Colab's port forwarding to access your professional risk assessment app!
# Install and import required packages.
# FIX: `!pip install` is IPython-only magic; invoking pip via the current
# interpreter (`python -m pip`) works both in notebooks and plain scripts.
import subprocess
import sys
import time

subprocess.run([sys.executable, "-m", "pip", "install", "streamlit-webrtc"])
# Use Colab's port forwarding
from google.colab import output
# Start Streamlit.
# FIX: the original launched the placeholder "your_app_file.py", which does
# not exist; the app generated earlier in this notebook is
# "loan_default_risk_predictor.py".
process = subprocess.Popen([
    "streamlit", "run", "loan_default_risk_predictor.py",
    "--server.port", "8501",
    "--server.headless", "true"
])
print("🚀 Streamlit is starting...")
time.sleep(10)  # allow the server time to come up before printing hints
# Display the local URL - Colab should automatically detect it
print("🌐 Your app should be running at:")
print("📱 Check for a popup or notification from Colab about port forwarding")
print("🔗 Or try: https://colab.research.google.com/drive/your-notebook#scrollTo=your-cell")
Collecting streamlit-webrtc Downloading streamlit_webrtc-0.63.4-py3-none-any.whl.metadata (18 kB) Collecting aioice>=0.10.1 (from streamlit-webrtc) Downloading aioice-0.10.1-py3-none-any.whl.metadata (4.1 kB) Collecting aiortc>=1.11.0 (from streamlit-webrtc) Downloading aiortc-1.13.0-py3-none-any.whl.metadata (4.9 kB) Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.12/dist-packages (from streamlit-webrtc) (25.0) Requirement already satisfied: streamlit>=0.89.0 in /usr/local/lib/python3.12/dist-packages (from streamlit-webrtc) (1.48.1) Collecting dnspython>=2.0.0 (from aioice>=0.10.1->streamlit-webrtc) Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB) Collecting ifaddr>=0.2.0 (from aioice>=0.10.1->streamlit-webrtc) Downloading ifaddr-0.2.0-py3-none-any.whl.metadata (4.9 kB) Collecting av<15.0.0,>=14.0.0 (from aiortc>=1.11.0->streamlit-webrtc) Downloading av-14.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.6 kB) Requirement already satisfied: cffi>=1.0.0 in /usr/local/lib/python3.12/dist-packages (from aiortc>=1.11.0->streamlit-webrtc) (1.17.1) Collecting cryptography>=44.0.0 (from aiortc>=1.11.0->streamlit-webrtc) Downloading cryptography-45.0.6-cp311-abi3-manylinux_2_34_x86_64.whl.metadata (5.7 kB) Requirement already satisfied: google-crc32c>=1.1 in /usr/local/lib/python3.12/dist-packages (from aiortc>=1.11.0->streamlit-webrtc) (1.7.1) Collecting pyee>=13.0.0 (from aiortc>=1.11.0->streamlit-webrtc) Downloading pyee-13.0.0-py3-none-any.whl.metadata (2.9 kB) Collecting pylibsrtp>=0.10.0 (from aiortc>=1.11.0->streamlit-webrtc) Downloading pylibsrtp-0.12.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB) Collecting pyopenssl>=25.0.0 (from aiortc>=1.11.0->streamlit-webrtc) Downloading pyopenssl-25.1.0-py3-none-any.whl.metadata (17 kB) Requirement already satisfied: altair!=5.4.0,!=5.4.1,<6,>=4.0 in /usr/local/lib/python3.12/dist-packages (from 
streamlit>=0.89.0->streamlit-webrtc) (5.5.0) Requirement already satisfied: blinker<2,>=1.5.0 in /usr/local/lib/python3.12/dist-packages (from streamlit>=0.89.0->streamlit-webrtc) (1.9.0) Requirement already satisfied: cachetools<7,>=4.0 in /usr/local/lib/python3.12/dist-packages (from streamlit>=0.89.0->streamlit-webrtc) (5.5.2) Requirement already satisfied: click<9,>=7.0 in /usr/local/lib/python3.12/dist-packages (from streamlit>=0.89.0->streamlit-webrtc) (8.2.1) Requirement already satisfied: numpy<3,>=1.23 in /usr/local/lib/python3.12/dist-packages (from streamlit>=0.89.0->streamlit-webrtc) (2.0.2) Requirement already satisfied: pandas<3,>=1.4.0 in /usr/local/lib/python3.12/dist-packages (from streamlit>=0.89.0->streamlit-webrtc) (2.2.2) Requirement already satisfied: pillow<12,>=7.1.0 in /usr/local/lib/python3.12/dist-packages (from streamlit>=0.89.0->streamlit-webrtc) (11.3.0) Requirement already satisfied: protobuf<7,>=3.20 in /usr/local/lib/python3.12/dist-packages (from streamlit>=0.89.0->streamlit-webrtc) (5.29.5) Requirement already satisfied: pyarrow>=7.0 in /usr/local/lib/python3.12/dist-packages (from streamlit>=0.89.0->streamlit-webrtc) (18.1.0) Requirement already satisfied: requests<3,>=2.27 in /usr/local/lib/python3.12/dist-packages (from streamlit>=0.89.0->streamlit-webrtc) (2.32.4) Requirement already satisfied: tenacity<10,>=8.1.0 in /usr/local/lib/python3.12/dist-packages (from streamlit>=0.89.0->streamlit-webrtc) (8.5.0) Requirement already satisfied: toml<2,>=0.10.1 in /usr/local/lib/python3.12/dist-packages (from streamlit>=0.89.0->streamlit-webrtc) (0.10.2) Requirement already satisfied: typing-extensions<5,>=4.4.0 in /usr/local/lib/python3.12/dist-packages (from streamlit>=0.89.0->streamlit-webrtc) (4.14.1) Requirement already satisfied: watchdog<7,>=2.1.5 in /usr/local/lib/python3.12/dist-packages (from streamlit>=0.89.0->streamlit-webrtc) (6.0.0) Requirement already satisfied: gitpython!=3.1.19,<4,>=3.0.7 in 
/usr/local/lib/python3.12/dist-packages (from streamlit>=0.89.0->streamlit-webrtc) (3.1.45) Requirement already satisfied: pydeck<1,>=0.8.0b4 in /usr/local/lib/python3.12/dist-packages (from streamlit>=0.89.0->streamlit-webrtc) (0.9.1) Requirement already satisfied: tornado!=6.5.0,<7,>=6.0.3 in /usr/local/lib/python3.12/dist-packages (from streamlit>=0.89.0->streamlit-webrtc) (6.4.2) Requirement already satisfied: jinja2 in /usr/local/lib/python3.12/dist-packages (from altair!=5.4.0,!=5.4.1,<6,>=4.0->streamlit>=0.89.0->streamlit-webrtc) (3.1.6) Requirement already satisfied: jsonschema>=3.0 in /usr/local/lib/python3.12/dist-packages (from altair!=5.4.0,!=5.4.1,<6,>=4.0->streamlit>=0.89.0->streamlit-webrtc) (4.25.1) Requirement already satisfied: narwhals>=1.14.2 in /usr/local/lib/python3.12/dist-packages (from altair!=5.4.0,!=5.4.1,<6,>=4.0->streamlit>=0.89.0->streamlit-webrtc) (2.1.2) Requirement already satisfied: pycparser in /usr/local/lib/python3.12/dist-packages (from cffi>=1.0.0->aiortc>=1.11.0->streamlit-webrtc) (2.22) Requirement already satisfied: gitdb<5,>=4.0.1 in /usr/local/lib/python3.12/dist-packages (from gitpython!=3.1.19,<4,>=3.0.7->streamlit>=0.89.0->streamlit-webrtc) (4.0.12) Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.12/dist-packages (from pandas<3,>=1.4.0->streamlit>=0.89.0->streamlit-webrtc) (2.9.0.post0) Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas<3,>=1.4.0->streamlit>=0.89.0->streamlit-webrtc) (2025.2) Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas<3,>=1.4.0->streamlit>=0.89.0->streamlit-webrtc) (2025.2) Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests<3,>=2.27->streamlit>=0.89.0->streamlit-webrtc) (3.4.3) Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.12/dist-packages (from 
requests<3,>=2.27->streamlit>=0.89.0->streamlit-webrtc) (3.10) Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests<3,>=2.27->streamlit>=0.89.0->streamlit-webrtc) (2.5.0) Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.12/dist-packages (from requests<3,>=2.27->streamlit>=0.89.0->streamlit-webrtc) (2025.8.3) Requirement already satisfied: smmap<6,>=3.0.1 in /usr/local/lib/python3.12/dist-packages (from gitdb<5,>=4.0.1->gitpython!=3.1.19,<4,>=3.0.7->streamlit>=0.89.0->streamlit-webrtc) (5.0.2) Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.12/dist-packages (from jinja2->altair!=5.4.0,!=5.4.1,<6,>=4.0->streamlit>=0.89.0->streamlit-webrtc) (3.0.2) Requirement already satisfied: attrs>=22.2.0 in /usr/local/lib/python3.12/dist-packages (from jsonschema>=3.0->altair!=5.4.0,!=5.4.1,<6,>=4.0->streamlit>=0.89.0->streamlit-webrtc) (25.3.0) Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /usr/local/lib/python3.12/dist-packages (from jsonschema>=3.0->altair!=5.4.0,!=5.4.1,<6,>=4.0->streamlit>=0.89.0->streamlit-webrtc) (2025.4.1) Requirement already satisfied: referencing>=0.28.4 in /usr/local/lib/python3.12/dist-packages (from jsonschema>=3.0->altair!=5.4.0,!=5.4.1,<6,>=4.0->streamlit>=0.89.0->streamlit-webrtc) (0.36.2) Requirement already satisfied: rpds-py>=0.7.1 in /usr/local/lib/python3.12/dist-packages (from jsonschema>=3.0->altair!=5.4.0,!=5.4.1,<6,>=4.0->streamlit>=0.89.0->streamlit-webrtc) (0.27.0) Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.8.2->pandas<3,>=1.4.0->streamlit>=0.89.0->streamlit-webrtc) (1.17.0) Downloading streamlit_webrtc-0.63.4-py3-none-any.whl (216 kB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 216.9/216.9 kB 5.7 MB/s eta 0:00:00 Downloading aioice-0.10.1-py3-none-any.whl (24 kB) Downloading aiortc-1.13.0-py3-none-any.whl (92 kB) 
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 92.9/92.9 kB 8.9 MB/s eta 0:00:00 Downloading av-14.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (35.5 MB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 35.5/35.5 MB 35.1 MB/s eta 0:00:00 Downloading cryptography-45.0.6-cp311-abi3-manylinux_2_34_x86_64.whl (4.5 MB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 4.5/4.5 MB 103.4 MB/s eta 0:00:00 Downloading dnspython-2.7.0-py3-none-any.whl (313 kB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 313.6/313.6 kB 23.1 MB/s eta 0:00:00 Downloading ifaddr-0.2.0-py3-none-any.whl (12 kB) Downloading pyee-13.0.0-py3-none-any.whl (15 kB) Downloading pylibsrtp-0.12.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.2 MB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.2/2.2 MB 78.5 MB/s eta 0:00:00 Downloading pyopenssl-25.1.0-py3-none-any.whl (56 kB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 56.8/56.8 kB 4.2 MB/s eta 0:00:00 Installing collected packages: ifaddr, pyee, dnspython, av, pylibsrtp, cryptography, aioice, pyopenssl, aiortc, streamlit-webrtc Attempting uninstall: cryptography Found existing installation: cryptography 43.0.3 Uninstalling cryptography-43.0.3: Successfully uninstalled cryptography-43.0.3 Attempting uninstall: pyopenssl Found existing installation: pyOpenSSL 24.2.1 Uninstalling pyOpenSSL-24.2.1: Successfully uninstalled pyOpenSSL-24.2.1 ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts. pydrive2 1.21.3 requires cryptography<44, but you have cryptography 45.0.6 which is incompatible. pydrive2 1.21.3 requires pyOpenSSL<=24.2.1,>=19.1.0, but you have pyopenssl 25.1.0 which is incompatible. Successfully installed aioice-0.10.1 aiortc-1.13.0 av-14.4.0 cryptography-45.0.6 dnspython-2.7.0 ifaddr-0.2.0 pyee-13.0.0 pylibsrtp-0.12.0 pyopenssl-25.1.0 streamlit-webrtc-0.63.4 🚀 Streamlit is starting... 
🌐 Your app should be running at: 📱 Check for a popup or notification from Colab about port forwarding 🔗 Or try: https://colab.research.google.com/drive/your-notebook#scrollTo=your-cell
# Kill all Streamlit processes and free up port 8501.
# FIX: the `!pkill` / `!fuser` lines are IPython-only shell magics and are
# syntax errors in a plain Python file; subprocess runs the same commands.
import subprocess
import time

subprocess.run(["pkill", "-f", "streamlit"])
subprocess.run(["pkill", "-f", "port 8501"])
subprocess.run(["fuser", "-k", "8501/tcp"])  # force-kill anything still bound to 8501
time.sleep(3)  # give the OS a moment to release the socket
print("✅ Port 8501 should now be free")
✅ Port 8501 should now be free
# Create requirements.txt listing the packages the deployed dashboard needs.
requirements = """
streamlit
pandas
numpy
scikit-learn
plotly
seaborn
matplotlib
"""

# Strip the surrounding blank lines once so the written file and the
# "Contents" preview printed below show exactly the same text (the original
# wrote the stripped string but printed the raw one).
requirements_text = requirements.strip()
with open('requirements.txt', 'w', encoding='utf-8') as f:
    f.write(requirements_text)

print("✅ requirements.txt created")
print("📄 Contents:")
print(requirements_text)
✅ requirements.txt created 📄 Contents: streamlit pandas numpy scikit-learn plotly seaborn matplotlib
📄 view_my_model.py 📄 requirements.txt 📄 loan_default_risk_predictor.py 🔽 You'll need to download these files: ['view_my_model.py', 'requirements.txt', 'loan_default_risk_predictor.py']
The error `KeyError: 'repayment_curr_ratio'` occurs because the column `repayment_curr_ratio` does not exist in your DataFrame `df_final`.
Based on the previous steps, it looks like you intended to calculate some features related to the current loan. I will add the necessary code to calculate these features (interest_curr_amount, interest_curr_rate, and repayment_curr_ratio) before calculating the repayment_efficiency and late_payment_rate.
Now that the columns are corrected, here are the descriptive statistics and the correlation matrix of your data.
Next, you could consider building a predictive model to assess loan risk.
Now that we have visualized and explored the data, the next step is to prepare it for modeling. This involves encoding the categorical features and splitting the data into training and testing sets.
Would you like me to proceed with that?